Merge branch 'x86-build-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Linus Torvalds <torvalds@linux-foundation.org>

Tue, 15 Mar 2016 17:16:48 +0000 (10:16 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 15 Mar 2016 17:16:48 +0000 (10:16 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 15 Mar 2016 17:16:48 +0000 (10:16 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 15 Mar 2016 17:16:48 +0000 (10:16 -0700)
diff --git a/Documentation/devicetree/bindings/arm/omap/omap.txt b/Documentation/devicetree/bindings/arm/omap/omap.txt

index a2bd593881cab361fa739d9a12e63ee7ba63ef50..66422d6631841d9e6eba9be63e453176ebd43f2e 100644 (file)
--- a/Documentation/devicetree/bindings/arm/omap/omap.txt
+++ b/Documentation/devicetree/bindings/arm/omap/omap.txt
@@ -23,6 +23,7 @@ Optional properties:
    during suspend.
  - ti,no-reset-on-init: When present, the module should not be reset at init
  - ti,no-idle-on-init: When present, the module should not be idled at init
+- ti,no-idle: When present, the module is never allowed to idle.
  
  Example:
  
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt

index 9a53c929f017d16527270bc2244352edf1d34cd8..084775f7b052de0f9998ffa7bfa2e48080f1aec1 100644 (file)
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -666,7 +666,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
  
         clearcpuid=BITNUM [X86]
                         Disable CPUID feature X for the kernel. See
-                       arch/x86/include/asm/cpufeature.h for the valid bit
+                       arch/x86/include/asm/cpufeatures.h for the valid bit
                         numbers. Note the Linux specific bits are not necessarily
                         stable over kernel options, but the vendor specific
                         ones should be.
@@ -3491,6 +3491,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
  
         ro              [KNL] Mount root device read-only on boot
  
+       rodata=         [KNL]
+               on      Mark read-only kernel memory as read-only (default).
+               off     Leave read-only kernel memory writable for debugging.
+
         root=           [KNL] Root filesystem
                         See name_to_dev_t comment in init/do_mounts.c.
  
@@ -3528,6 +3532,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
  
         sched_debug     [KNL] Enables verbose scheduler debug messages.
  
+       schedstats=     [KNL,X86] Enable or disable scheduled statistics.
+                       Allowed values are enable and disable. This feature
+                       incurs a small amount of overhead in the scheduler
+                       but is useful for debugging and performance tuning.
+
         skew_tick=      [KNL] Offset the periodic timer tick per cpu to mitigate
                         xtime_lock contention on larger systems, and/or RCU lock
                         contention on all systems with CONFIG_MAXSMP set.
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt

index a93b414672a71ac6fa9bac1e848215804bde139c..f4444c94ff285db493961c54a1527a189da20fd2 100644 (file)
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -58,6 +58,8 @@ show up in /proc/sys/kernel:
  - panic_on_stackoverflow
  - panic_on_unrecovered_nmi
  - panic_on_warn
+- perf_cpu_time_max_percent
+- perf_event_paranoid
  - pid_max
  - powersave-nap               [ PPC only ]
  - printk
@@ -639,6 +641,17 @@ allowed to execute.
  
  ==============================================================
  
+perf_event_paranoid:
+
+Controls use of the performance events system by unprivileged
+users (without CAP_SYS_ADMIN).  The default value is 1.
+
+ -1: Allow use of (almost) all events by all users
+>=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
+>=1: Disallow CPU event access by users without CAP_SYS_ADMIN
+>=2: Disallow kernel profiling by users without CAP_SYS_ADMIN
+
+==============================================================
  
  pid_max:
  
@@ -760,6 +773,14 @@ rtsig-nr shows the number of RT signals currently queued.
  
  ==============================================================
  
+sched_schedstats:
+
+Enables/disables scheduler statistics. Enabling this feature
+incurs a small amount of overhead in the scheduler but is
+useful for debugging and performance tuning.
+
+==============================================================
+
  sg-big-buff:
  
  This file shows the size of the generic SCSI (sg) buffer.
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt

index daf9c0f742d22e882637391539e90ea31eb788f3..c81731096a4338bcec4d43b8049a26118d10f260 100644 (file)
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -358,7 +358,8 @@ In the first case there are two additional complications:
  - if CR4.SMEP is enabled: since we've turned the page into a kernel page,
    the kernel may now execute it.  We handle this by also setting spte.nx.
    If we get a user fetch or read fault, we'll change spte.u=1 and
-  spte.nx=gpte.nx back.
+  spte.nx=gpte.nx back.  For this to work, KVM forces EFER.NX to 1 when
+  shadow paging is in use.
  - if CR4.SMAP is disabled: since the page has been changed to a kernel
    page, it can not be reused when CR4.SMAP is enabled. We set
    CR4.SMAP && !CR0.WP into shadow page's role to avoid this case. Note,
diff --git a/Documentation/x86/exception-tables.txt b/Documentation/x86/exception-tables.txt

index 32901aa36f0a99f7c087aa7c510bbceb0ec14a89..e396bcd8d830428a30231986fd6f1a2ee9bdd921 100644 (file)
--- a/Documentation/x86/exception-tables.txt
+++ b/Documentation/x86/exception-tables.txt
@@ -290,3 +290,38 @@ Due to the way that the exception table is built and needs to be ordered,
  only use exceptions for code in the .text section.  Any other section
  will cause the exception table to not be sorted correctly, and the
  exceptions will fail.
+
+Things changed when 64-bit support was added to x86 Linux. Rather than
+double the size of the exception table by expanding the two entries
+from 32-bits to 64 bits, a clever trick was used to store addresses
+as relative offsets from the table itself. The assembly code changed
+from:
+       .long 1b,3b
+to:
+        .long (from) - .
+        .long (to) - .
+
+and the C-code that uses these values converts back to absolute addresses
+like this:
+
+       ex_insn_addr(const struct exception_table_entry *x)
+       {
+               return (unsigned long)&x->insn + x->insn;
+       }
+
+In v4.6 the exception table entry was expanded with a new field "handler".
+This is also 32-bits wide and contains a third relative function
+pointer which points to one of:
+
+1) int ex_handler_default(const struct exception_table_entry *fixup)
+   This is legacy case that just jumps to the fixup code
+2) int ex_handler_fault(const struct exception_table_entry *fixup)
+   This case provides the fault number of the trap that occurred at
+   entry->insn. It is used to distinguish page faults from machine
+   check.
+3) int ex_handler_ext(const struct exception_table_entry *fixup)
+   This case is used for uaccess_err ... we need to set a flag
+   in the task structure. Before the handler functions existed this
+   case was handled by adding a large offset to the fixup to tag
+   it as special.
+More functions can easily be added.
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt

index 68ed3114c363bf7332b2515b156bf0953ca88d24..0965a71f994243e70b3ae04fe78f73263666c1af 100644 (file)
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -60,6 +60,8 @@ Machine check
                 threshold to 1. Enabling this may make memory predictive failure
                 analysis less effective if the bios sets thresholds for memory
                 errors since we will not see details for all errors.
+   mce=recovery
+               Force-enable recoverable machine check code paths
  
     nomce (for compatibility with i386): same as mce=off
  
diff --git a/MAINTAINERS b/MAINTAINERS

index 4029c63d8a7d4dd68bebf7f923755a9904411ae3..2061ea77667c36260a72f92ba12c3ecdb625d3c6 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7399,6 +7399,17 @@ W:       https://www.myricom.com/support/downloads/myri10ge.html
  S:     Supported
  F:     drivers/net/ethernet/myricom/myri10ge/
  
+NAND FLASH SUBSYSTEM
+M:     Boris Brezillon <boris.brezillon@free-electrons.com>
+R:     Richard Weinberger <richard@nod.at>
+L:     linux-mtd@lists.infradead.org
+W:     http://www.linux-mtd.infradead.org/
+Q:     http://patchwork.ozlabs.org/project/linux-mtd/list/
+T:     git git://github.com/linux-nand/linux.git
+S:     Maintained
+F:     drivers/mtd/nand/
+F:     include/linux/mtd/nand*.h
+
  NATSEMI ETHERNET DRIVER (DP8381x)
  S:     Orphan
  F:     drivers/net/ethernet/natsemi/natsemi.c
@@ -8464,6 +8475,7 @@ PERFORMANCE EVENTS SUBSYSTEM
  M:     Peter Zijlstra <peterz@infradead.org>
  M:     Ingo Molnar <mingo@redhat.com>
  M:     Arnaldo Carvalho de Melo <acme@kernel.org>
+R:     Alexander Shishkin <alexander.shishkin@linux.intel.com>
  L:     linux-kernel@vger.kernel.org
  T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core
  S:     Supported
diff --git a/Makefile b/Makefile

index 2d519d2fb3a9be1a64100c85141b824a37924602..7b3ecdcdc6c1815ca6241a72b1967593a0335b41 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
  VERSION = 4
  PATCHLEVEL = 5
  SUBLEVEL = 0
-EXTRAVERSION = -rc7
+EXTRAVERSION =
  NAME = Blurry Fish Butt
  
  # *DOCUMENTATION*
diff --git a/arch/arm/boot/dts/armada-xp-axpwifiap.dts b/arch/arm/boot/dts/armada-xp-axpwifiap.dts

index 23fc670c0427710168ce41ef012f9e37b8218eb6..5c21b236721fcb9d6e2b5055cbc89ae9527df7f1 100644 (file)
--- a/arch/arm/boot/dts/armada-xp-axpwifiap.dts
+++ b/arch/arm/boot/dts/armada-xp-axpwifiap.dts
@@ -70,8 +70,8 @@
         soc {
                 ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                           MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
  
                 pcie-controller {
                         status = "okay";
diff --git a/arch/arm/boot/dts/armada-xp-db.dts b/arch/arm/boot/dts/armada-xp-db.dts

index f774101416a5522841c475252d0a86842e475b29..ebe1d267406df5ab30e3a3189b669733eb8fcaa4 100644 (file)
--- a/arch/arm/boot/dts/armada-xp-db.dts
+++ b/arch/arm/boot/dts/armada-xp-db.dts
@@ -76,8 +76,8 @@
                 ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                           MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
                           MBUS_ID(0x01, 0x2f) 0 0 0xf0000000 0x1000000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
  
                 devbus-bootcs {
                         status = "okay";
diff --git a/arch/arm/boot/dts/armada-xp-gp.dts b/arch/arm/boot/dts/armada-xp-gp.dts

index 4878d7353069fc2720e15d76af307618daced15c..5730b875c4f51a1aa2743d8881b18d6dbc0b27cd 100644 (file)
--- a/arch/arm/boot/dts/armada-xp-gp.dts
+++ b/arch/arm/boot/dts/armada-xp-gp.dts
@@ -95,8 +95,8 @@
                 ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                           MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
                           MBUS_ID(0x01, 0x2f) 0 0 0xf0000000 0x1000000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
  
                 devbus-bootcs {
                         status = "okay";
diff --git a/arch/arm/boot/dts/armada-xp-lenovo-ix4-300d.dts b/arch/arm/boot/dts/armada-xp-lenovo-ix4-300d.dts

index fb9e1bbf23385b85b0b82ddb153b922b2d2e0178..8af463f26ea1e162360952dbc03c0a9bda807cd5 100644 (file)
--- a/arch/arm/boot/dts/armada-xp-lenovo-ix4-300d.dts
+++ b/arch/arm/boot/dts/armada-xp-lenovo-ix4-300d.dts
@@ -65,8 +65,8 @@
         soc {
                 ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xd0000000 0x100000
                         MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                       MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                       MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                       MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                       MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
  
                 pcie-controller {
                         status = "okay";
diff --git a/arch/arm/boot/dts/armada-xp-linksys-mamba.dts b/arch/arm/boot/dts/armada-xp-linksys-mamba.dts

index 6e9820e141f8de5a850edcd7e74f0ea2a1acd1ed..b89e6cf1271a1370aead2a000e729cefad376ab0 100644 (file)
--- a/arch/arm/boot/dts/armada-xp-linksys-mamba.dts
+++ b/arch/arm/boot/dts/armada-xp-linksys-mamba.dts
@@ -70,8 +70,8 @@
         soc {
                 ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                           MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
  
                 pcie-controller {
                         status = "okay";
diff --git a/arch/arm/boot/dts/armada-xp-matrix.dts b/arch/arm/boot/dts/armada-xp-matrix.dts

index 6ab33837a2b6d0a5aaf4e48763bb838451a346bd..6522b04f4a8e7c7a759100153d7d990f1fb160d3 100644 (file)
--- a/arch/arm/boot/dts/armada-xp-matrix.dts
+++ b/arch/arm/boot/dts/armada-xp-matrix.dts
@@ -68,8 +68,8 @@
         soc {
                 ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                           MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
  
                 internal-regs {
                         serial@12000 {
diff --git a/arch/arm/boot/dts/armada-xp-netgear-rn2120.dts b/arch/arm/boot/dts/armada-xp-netgear-rn2120.dts

index 62175a8848bc2234343a8627d7cdf0ababfee851..d19f44c709257d9a35644d71cb49a04e354c42d8 100644 (file)
--- a/arch/arm/boot/dts/armada-xp-netgear-rn2120.dts
+++ b/arch/arm/boot/dts/armada-xp-netgear-rn2120.dts
@@ -64,8 +64,8 @@
         soc {
                 ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xd0000000 0x100000
                           MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
  
                 pcie-controller {
                         status = "okay";
diff --git a/arch/arm/boot/dts/armada-xp-openblocks-ax3-4.dts b/arch/arm/boot/dts/armada-xp-openblocks-ax3-4.dts

index a5db17782e085662d01a22683c5c364be5c25c8c..853bd392a4fe20155ed1469a23175213814ca9dc 100644 (file)
--- a/arch/arm/boot/dts/armada-xp-openblocks-ax3-4.dts
+++ b/arch/arm/boot/dts/armada-xp-openblocks-ax3-4.dts
@@ -65,9 +65,9 @@
         soc {
                 ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xd0000000 0x100000
                           MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x01, 0x2f) 0 0 0xf0000000 0x8000000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x01, 0x2f) 0 0 0xe8000000 0x8000000
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
  
                 devbus-bootcs {
                         status = "okay";
diff --git a/arch/arm/boot/dts/armada-xp-synology-ds414.dts b/arch/arm/boot/dts/armada-xp-synology-ds414.dts

index 2391b11dc546b859dd3ab23a700be5a8a63ea83b..d17dab0a6f513e9c04ff2675f5f9d44d7f6953b3 100644 (file)
--- a/arch/arm/boot/dts/armada-xp-synology-ds414.dts
+++ b/arch/arm/boot/dts/armada-xp-synology-ds414.dts
@@ -78,8 +78,8 @@
         soc {
                 ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                           MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
  
                 pcie-controller {
                         status = "okay";
diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi

index c4d9175b90dceab5aa64ca1973505652bf325bee..f82aa44c3ceed540fef7e291c571c2b49c5b2e1f 100644 (file)
--- a/arch/arm/boot/dts/dra7.dtsi
+++ b/arch/arm/boot/dts/dra7.dtsi
@@ -1500,6 +1500,16 @@
                                0x48485200 0x2E00>;
                         #address-cells = <1>;
                         #size-cells = <1>;
+
+                       /*
+                        * Do not allow gating of cpsw clock as workaround
+                        * for errata i877. Keeping internal clock disabled
+                        * causes the device switching characteristics
+                        * to degrade over time and eventually fail to meet
+                        * the data manual delay time/skew specs.
+                        */
+                       ti,no-idle;
+
                         /*
                          * rx_thresh_pend
                          * rx_pend
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h

index d5525bfc7e3e61879d278ae08b446e185c206982..9156fc303afd8d278671c6d4d277d866fdd2ad7b 100644 (file)
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -491,7 +491,6 @@ static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
  #endif
  
  #ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
  void set_kernel_text_rw(void);
  void set_kernel_text_ro(void);
  #else
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c

index 7d0cba6f1cc5efadff31fd0cde2aca3279e120da..139791ed473d5264682c004ea2ea7af8ddd28f5d 100644 (file)
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -176,13 +176,13 @@ static struct resource mem_res[] = {
                 .name = "Kernel code",
                 .start = 0,
                 .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
         },
         {
                 .name = "Kernel data",
                 .start = 0,
                 .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
         }
  };
  
@@ -851,7 +851,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
                 res->name  = "System RAM";
                 res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
                 res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
  
                 request_resource(&iomem_resource, res);
  
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c

index dda1959f0ddeb947e8a8020d7da0b02bb19f89cc..08e49c423c24147a2f2c40b011866c8d2d144de9 100644 (file)
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -506,18 +506,18 @@ static void kvm_arm_resume_guest(struct kvm *kvm)
         struct kvm_vcpu *vcpu;
  
         kvm_for_each_vcpu(i, vcpu, kvm) {
-               wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
+               struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
  
                 vcpu->arch.pause = false;
-               wake_up_interruptible(wq);
+               swake_up(wq);
         }
  }
  
  static void vcpu_sleep(struct kvm_vcpu *vcpu)
  {
-       wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
+       struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
  
-       wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
+       swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
                                        (!vcpu->arch.pause)));
  }
  
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c

index a9b3b905e661dec55672e459f1119b3eb466b373..c2b131527a643f6e6808b78f8c53e9751d0f71a9 100644 (file)
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -70,7 +70,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
  {
         struct kvm *kvm = source_vcpu->kvm;
         struct kvm_vcpu *vcpu = NULL;
-       wait_queue_head_t *wq;
+       struct swait_queue_head *wq;
         unsigned long cpu_id;
         unsigned long context_id;
         phys_addr_t target_pc;
@@ -119,7 +119,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
         smp_mb();               /* Make sure the above is visible */
  
         wq = kvm_arch_vcpu_wq(vcpu);
-       wake_up_interruptible(wq);
+       swake_up(wq);
  
         return PSCI_RET_SUCCESS;
  }
diff --git a/arch/arm/mach-lpc32xx/phy3250.c b/arch/arm/mach-lpc32xx/phy3250.c

index 77d6b1bab278292e31a6435294e824a6a76648b9..ee06fabdf60e69e014317c9703787da5f31dc79f 100644 (file)
--- a/arch/arm/mach-lpc32xx/phy3250.c
+++ b/arch/arm/mach-lpc32xx/phy3250.c
@@ -86,8 +86,8 @@ static int lpc32xx_clcd_setup(struct clcd_fb *fb)
  {
         dma_addr_t dma;
  
-       fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev,
-               PANEL_SIZE, &dma, GFP_KERNEL);
+       fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, PANEL_SIZE, &dma,
+                                         GFP_KERNEL);
         if (!fb->fb.screen_base) {
                 printk(KERN_ERR "CLCD: unable to map framebuffer\n");
                 return -ENOMEM;
@@ -116,15 +116,14 @@ static int lpc32xx_clcd_setup(struct clcd_fb *fb)
  
  static int lpc32xx_clcd_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
  {
-       return dma_mmap_writecombine(&fb->dev->dev, vma,
-               fb->fb.screen_base, fb->fb.fix.smem_start,
-               fb->fb.fix.smem_len);
+       return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+                          fb->fb.fix.smem_start, fb->fb.fix.smem_len);
  }
  
  static void lpc32xx_clcd_remove(struct clcd_fb *fb)
  {
-       dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len,
-               fb->fb.screen_base, fb->fb.fix.smem_start);
+       dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
+                   fb->fb.fix.smem_start);
  }
  
  /*
diff --git a/arch/arm/mach-netx/fb.c b/arch/arm/mach-netx/fb.c

index d122ee6ab9913aa8e8f2896ea0fee44a01832068..8814ee5e98fd52ff75412fd863542954037dafb8 100644 (file)
--- a/arch/arm/mach-netx/fb.c
+++ b/arch/arm/mach-netx/fb.c
@@ -42,8 +42,8 @@ int netx_clcd_setup(struct clcd_fb *fb)
  
         fb->panel = netx_panel;
  
-       fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev, 1024*1024,
-                                                   &dma, GFP_KERNEL);
+       fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, 1024 * 1024, &dma,
+                                         GFP_KERNEL);
         if (!fb->fb.screen_base) {
                 printk(KERN_ERR "CLCD: unable to map framebuffer\n");
                 return -ENOMEM;
@@ -57,16 +57,14 @@ int netx_clcd_setup(struct clcd_fb *fb)
  
  int netx_clcd_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
  {
-       return dma_mmap_writecombine(&fb->dev->dev, vma,
-                                    fb->fb.screen_base,
-                                    fb->fb.fix.smem_start,
-                                    fb->fb.fix.smem_len);
+       return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+                          fb->fb.fix.smem_start, fb->fb.fix.smem_len);
  }
  
  void netx_clcd_remove(struct clcd_fb *fb)
  {
-       dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len,
-                             fb->fb.screen_base, fb->fb.fix.smem_start);
+       dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
+                   fb->fb.fix.smem_start);
  }
  
  static AMBA_AHB_DEVICE(fb, "fb", 0, 0x00104000, { NETX_IRQ_LCD }, NULL);
diff --git a/arch/arm/mach-nspire/clcd.c b/arch/arm/mach-nspire/clcd.c

index abea12617b173e2b34f794c4bdf98b8b5d518c48..ea0e5b2ca1cd49e6328faccbb467c1d7b669430f 100644 (file)
--- a/arch/arm/mach-nspire/clcd.c
+++ b/arch/arm/mach-nspire/clcd.c
@@ -90,8 +90,8 @@ int nspire_clcd_setup(struct clcd_fb *fb)
         panel_size = ((panel->mode.xres * panel->mode.yres) * panel->bpp) / 8;
         panel_size = ALIGN(panel_size, PAGE_SIZE);
  
-       fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev,
-               panel_size, &dma, GFP_KERNEL);
+       fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, panel_size, &dma,
+                                         GFP_KERNEL);
  
         if (!fb->fb.screen_base) {
                 pr_err("CLCD: unable to map framebuffer\n");
@@ -107,13 +107,12 @@ int nspire_clcd_setup(struct clcd_fb *fb)
  
  int nspire_clcd_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
  {
-       return dma_mmap_writecombine(&fb->dev->dev, vma,
-               fb->fb.screen_base, fb->fb.fix.smem_start,
-               fb->fb.fix.smem_len);
+       return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+                          fb->fb.fix.smem_start, fb->fb.fix.smem_len);
  }
  
  void nspire_clcd_remove(struct clcd_fb *fb)
  {
-       dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len,
-               fb->fb.screen_base, fb->fb.fix.smem_start);
+       dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
+                   fb->fb.fix.smem_start);
  }
diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c

index e9f65fec55c0b9beacfdf694424e601b4673327e..b6d62e4cdfddaf53f8d3aa7c1768b5e83255673e 100644 (file)
--- a/arch/arm/mach-omap2/omap_hwmod.c
+++ b/arch/arm/mach-omap2/omap_hwmod.c
@@ -2200,6 +2200,11 @@ static int _enable(struct omap_hwmod *oh)
   */
  static int _idle(struct omap_hwmod *oh)
  {
+       if (oh->flags & HWMOD_NO_IDLE) {
+               oh->_int_flags |= _HWMOD_SKIP_ENABLE;
+               return 0;
+       }
+
         pr_debug("omap_hwmod: %s: idling\n", oh->name);
  
         if (oh->_state != _HWMOD_STATE_ENABLED) {
@@ -2504,6 +2509,8 @@ static int __init _init(struct omap_hwmod *oh, void *data)
                         oh->flags |= HWMOD_INIT_NO_RESET;
                 if (of_find_property(np, "ti,no-idle-on-init", NULL))
                         oh->flags |= HWMOD_INIT_NO_IDLE;
+               if (of_find_property(np, "ti,no-idle", NULL))
+                       oh->flags |= HWMOD_NO_IDLE;
         }
  
         oh->_state = _HWMOD_STATE_INITIALIZED;
@@ -2630,7 +2637,7 @@ static void __init _setup_postsetup(struct omap_hwmod *oh)
          * XXX HWMOD_INIT_NO_IDLE does not belong in hwmod data -
          * it should be set by the core code as a runtime flag during startup
          */
-       if ((oh->flags & HWMOD_INIT_NO_IDLE) &&
+       if ((oh->flags & (HWMOD_INIT_NO_IDLE | HWMOD_NO_IDLE)) &&
             (postsetup_state == _HWMOD_STATE_IDLE)) {
                 oh->_int_flags |= _HWMOD_SKIP_ENABLE;
                 postsetup_state = _HWMOD_STATE_ENABLED;
diff --git a/arch/arm/mach-omap2/omap_hwmod.h b/arch/arm/mach-omap2/omap_hwmod.h

index 76bce11c85a40c477a5b87aab919d5b8dfded5e8..7c7a31169475b72bc8fbbe2f0dabf015b395f257 100644 (file)
--- a/arch/arm/mach-omap2/omap_hwmod.h
+++ b/arch/arm/mach-omap2/omap_hwmod.h
@@ -525,6 +525,8 @@ struct omap_hwmod_omap4_prcm {
   *     or idled.
   * HWMOD_OPT_CLKS_NEEDED: The optional clocks are needed for the module to
   *     operate and they need to be handled at the same time as the main_clk.
+ * HWMOD_NO_IDLE: Do not idle the hwmod at all. Useful to handle certain
+ *     IPs like CPSW on DRA7, where clocks to this module cannot be disabled.
   */
  #define HWMOD_SWSUP_SIDLE                      (1 << 0)
  #define HWMOD_SWSUP_MSTANDBY                   (1 << 1)
@@ -541,6 +543,7 @@ struct omap_hwmod_omap4_prcm {
  #define HWMOD_SWSUP_SIDLE_ACT                  (1 << 12)
  #define HWMOD_RECONFIG_IO_CHAIN                        (1 << 13)
  #define HWMOD_OPT_CLKS_NEEDED                  (1 << 14)
+#define HWMOD_NO_IDLE                          (1 << 15)
  
  /*
   * omap_hwmod._int_flags definitions
diff --git a/arch/arm/plat-samsung/pm-check.c b/arch/arm/plat-samsung/pm-check.c

index 04aff2c31b4607a99265d038d3ba0fd3b9a908b3..70f2f699bed3efe0802def3dc879cdd8e8857a05 100644 (file)
--- a/arch/arm/plat-samsung/pm-check.c
+++ b/arch/arm/plat-samsung/pm-check.c
@@ -53,8 +53,8 @@ static void s3c_pm_run_res(struct resource *ptr, run_fn_t fn, u32 *arg)
                 if (ptr->child != NULL)
                         s3c_pm_run_res(ptr->child, fn, arg);
  
-               if ((ptr->flags & IORESOURCE_MEM) &&
-                   strcmp(ptr->name, "System RAM") == 0) {
+               if ((ptr->flags & IORESOURCE_SYSTEM_RAM)
+                               == IORESOURCE_SYSTEM_RAM) {
                         S3C_PMDBG("Found system RAM at %08lx..%08lx\n",
                                   (unsigned long)ptr->start,
                                   (unsigned long)ptr->end);
diff --git a/arch/arm/vdso/vdso.S b/arch/arm/vdso/vdso.S

index b2b97e3e7babbbb37072b4185faa27a0e47760c8..a62a7b64f49c52706b8e133b1f786dc9dc34842d 100644 (file)
--- a/arch/arm/vdso/vdso.S
+++ b/arch/arm/vdso/vdso.S
@@ -23,9 +23,8 @@
  #include <linux/const.h>
  #include <asm/page.h>
  
-       __PAGE_ALIGNED_DATA
-
         .globl vdso_start, vdso_end
+       .section .data..ro_after_init
         .balign PAGE_SIZE
  vdso_start:
         .incbin "arch/arm/vdso/vdso.so"
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h

index 7fc294c3bc5baab31b13b1e23753d17835ce3b27..22dda613f9c91bd3bcfe3a8aad435f7384795401 100644 (file)
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -156,8 +156,4 @@ int set_memory_rw(unsigned long addr, int numpages);
  int set_memory_x(unsigned long addr, int numpages);
  int set_memory_nx(unsigned long addr, int numpages);
  
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
-#endif
-
  #endif
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h

index f5060867458024a5a61ccfd4a77330c728e13010..819aff5d593f701d1101d4cc42dbec8cedbc80aa 100644 (file)
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -40,7 +40,7 @@
   * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space,
   *     fixed mappings and modules
   */
-#define VMEMMAP_SIZE           ALIGN((1UL << (VA_BITS - PAGE_SHIFT - 1)) * sizeof(struct page), PUD_SIZE)
+#define VMEMMAP_SIZE           ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE)
  
  #ifndef CONFIG_KASAN
  #define VMALLOC_START          (VA_START)
@@ -52,7 +52,8 @@
  #define VMALLOC_END            (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K)
  
  #define VMEMMAP_START          (VMALLOC_END + SZ_64K)
-#define vmemmap                        ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT))
+#define vmemmap                        ((struct page *)VMEMMAP_START - \
+                                SECTION_ALIGN_DOWN(memstart_addr >> PAGE_SHIFT))
  
  #define FIRST_USER_ADDRESS     0UL
  
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c

index 8119479147db147c33800f76aa0d07c6072e8559..450987d99b9b9e497411a79f34781a3cb242df6e 100644 (file)
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -73,13 +73,13 @@ static struct resource mem_res[] = {
                 .name = "Kernel code",
                 .start = 0,
                 .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
         },
         {
                 .name = "Kernel data",
                 .start = 0,
                 .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
         }
  };
  
@@ -210,7 +210,7 @@ static void __init request_standard_resources(void)
                 res->name  = "System RAM";
                 res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
                 res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
  
                 request_resource(&iomem_resource, res);
  
diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S

index e33fe33876ab3804f2c6dcd6c5458e576596ef24..fd10eb6638689c1262c34cca7de533453cb383ae 100644 (file)
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -145,6 +145,10 @@ ENTRY(cpu_resume_mmu)
  ENDPROC(cpu_resume_mmu)
         .popsection
  cpu_resume_after_mmu:
+#ifdef CONFIG_KASAN
+       mov     x0, sp
+       bl      kasan_unpoison_remaining_stack
+#endif
         mov     x0, #0                  // return zero on success
         ldp     x19, x20, [sp, #16]
         ldp     x21, x22, [sp, #32]
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c

index 82d607c3614ed8454d19f2da5d44a6cc914ee0b3..da30529bb1f65c9e3d5408b2e28ab31bc2283211 100644 (file)
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -306,10 +306,6 @@ static __init int setup_hugepagesz(char *opt)
                 hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
         } else if (ps == PUD_SIZE) {
                 hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
-       } else if (ps == (PAGE_SIZE * CONT_PTES)) {
-               hugetlb_add_hstate(CONT_PTE_SHIFT);
-       } else if (ps == (PMD_SIZE * CONT_PMDS)) {
-               hugetlb_add_hstate((PMD_SHIFT + CONT_PMD_SHIFT) - PAGE_SHIFT);
         } else {
                 pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
                 return 0;
@@ -317,13 +313,3 @@ static __init int setup_hugepagesz(char *opt)
         return 1;
  }
  __setup("hugepagesz=", setup_hugepagesz);
-
-#ifdef CONFIG_ARM64_64K_PAGES
-static __init int add_default_hugepagesz(void)
-{
-       if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL)
-               hugetlb_add_hstate(CONT_PMD_SHIFT);
-       return 0;
-}
-arch_initcall(add_default_hugepagesz);
-#endif
diff --git a/arch/avr32/kernel/setup.c b/arch/avr32/kernel/setup.c

index 209ae5ad34958044ae3d6addcafa090f890a8be7..e6928896da2a4a2879828df92d940a2ed321a4ea 100644 (file)
--- a/arch/avr32/kernel/setup.c
+++ b/arch/avr32/kernel/setup.c
@@ -49,13 +49,13 @@ static struct resource __initdata kernel_data = {
         .name   = "Kernel data",
         .start  = 0,
         .end    = 0,
-       .flags  = IORESOURCE_MEM,
+       .flags  = IORESOURCE_SYSTEM_RAM,
  };
  static struct resource __initdata kernel_code = {
         .name   = "Kernel code",
         .start  = 0,
         .end    = 0,
-       .flags  = IORESOURCE_MEM,
+       .flags  = IORESOURCE_SYSTEM_RAM,
         .sibling = &kernel_data,
  };
  
@@ -134,7 +134,7 @@ add_physical_memory(resource_size_t start, resource_size_t end)
         new->start = start;
         new->end = end;
         new->name = "System RAM";
-       new->flags = IORESOURCE_MEM;
+       new->flags = IORESOURCE_SYSTEM_RAM;
  
         *pprev = new;
  }
diff --git a/arch/c6x/kernel/setup.c b/arch/c6x/kernel/setup.c

index 72e17f7ebd6ff0fba193ac07190d42b41e106e76..786e36e2f61de91c8f446cc941cff895cdb91578 100644 (file)
--- a/arch/c6x/kernel/setup.c
+++ b/arch/c6x/kernel/setup.c
@@ -281,8 +281,6 @@ notrace void __init machine_init(unsigned long dt_ptr)
          */
         set_ist(_vectors_start);
  
-       lockdep_init();
-
         /*
          * dtb is passed in from bootloader.
          * fdt is linked in blob.
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c

index caae3f4e4341a40d86f4014871c1ebe929a937ae..300dac3702f11a13460d1954843bfcbd1ddc2034 100644 (file)
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -1178,7 +1178,7 @@ efi_initialize_iomem_resources(struct resource *code_resource,
         efi_memory_desc_t *md;
         u64 efi_desc_size;
         char *name;
-       unsigned long flags;
+       unsigned long flags, desc;
  
         efi_map_start = __va(ia64_boot_param->efi_memmap);
         efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
@@ -1193,6 +1193,8 @@ efi_initialize_iomem_resources(struct resource *code_resource,
                         continue;
  
                 flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               desc = IORES_DESC_NONE;
+
                 switch (md->type) {
  
                         case EFI_MEMORY_MAPPED_IO:
@@ -1207,14 +1209,17 @@ efi_initialize_iomem_resources(struct resource *code_resource,
                                 if (md->attribute & EFI_MEMORY_WP) {
                                         name = "System ROM";
                                         flags |= IORESOURCE_READONLY;
-                               } else if (md->attribute == EFI_MEMORY_UC)
+                               } else if (md->attribute == EFI_MEMORY_UC) {
                                         name = "Uncached RAM";
-                               else
+                               } else {
                                         name = "System RAM";
+                                       flags |= IORESOURCE_SYSRAM;
+                               }
                                 break;
  
                         case EFI_ACPI_MEMORY_NVS:
                                 name = "ACPI Non-volatile Storage";
+                               desc = IORES_DESC_ACPI_NV_STORAGE;
                                 break;
  
                         case EFI_UNUSABLE_MEMORY:
@@ -1224,6 +1229,7 @@ efi_initialize_iomem_resources(struct resource *code_resource,
  
                         case EFI_PERSISTENT_MEMORY:
                                 name = "Persistent Memory";
+                               desc = IORES_DESC_PERSISTENT_MEMORY;
                                 break;
  
                         case EFI_RESERVED_TYPE:
@@ -1246,6 +1252,7 @@ efi_initialize_iomem_resources(struct resource *code_resource,
                 res->start = md->phys_addr;
                 res->end = md->phys_addr + efi_md_size(md) - 1;
                 res->flags = flags;
+               res->desc = desc;
  
                 if (insert_resource(&iomem_resource, res) < 0)
                         kfree(res);
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c

index 4f118b0d30915f4f5927719522414a66efe17f08..2029a38a72aeee89793d53ee60418567586d67ff 100644 (file)
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -80,17 +80,17 @@ unsigned long vga_console_membase;
  
  static struct resource data_resource = {
         .name   = "Kernel data",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
  };
  
  static struct resource code_resource = {
         .name   = "Kernel code",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
  };
  
  static struct resource bss_resource = {
         .name   = "Kernel bss",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
  };
  
  unsigned long ia64_max_cacheline_size;
diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c

index a5ecef7188baa39e6f21d7a5a7c5ed6cca8f040f..136c69f1fb8ab8b73c23e2a752ea371c6e484512 100644 (file)
--- a/arch/m32r/kernel/setup.c
+++ b/arch/m32r/kernel/setup.c
@@ -70,14 +70,14 @@ static struct resource data_resource = {
         .name   = "Kernel data",
         .start  = 0,
         .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
  };
  
  static struct resource code_resource = {
         .name   = "Kernel code",
         .start  = 0,
         .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
  };
  
  unsigned long memory_start;
diff --git a/arch/microblaze/kernel/setup.c b/arch/microblaze/kernel/setup.c

index 89a2a93949274b16b3508b314d0cbdf6db5946e6..f31ebb5dc26c21922f240545d038f0d6b5699300 100644 (file)
--- a/arch/microblaze/kernel/setup.c
+++ b/arch/microblaze/kernel/setup.c
@@ -130,8 +130,6 @@ void __init machine_early_init(const char *cmdline, unsigned int ram,
         memset(__bss_start, 0, __bss_stop-__bss_start);
         memset(_ssbss, 0, _esbss-_ssbss);
  
-       lockdep_init();
-
  /* initialize device tree for usage in early_printk */
         early_init_devtree(_fdt_start);
  
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig

index 74a3db92da1b52edc15165ac06d3b1558b8b78c0..d3da79dda629114e936af3e811348171c62a9bcd 100644 (file)
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2169,7 +2169,7 @@ config MIPS_MT_SMP
         select CPU_MIPSR2_IRQ_VI
         select CPU_MIPSR2_IRQ_EI
         select SYNC_R4K
-       select MIPS_GIC_IPI
+       select MIPS_GIC_IPI if MIPS_GIC
         select MIPS_MT
         select SMP
         select SMP_UP
@@ -2267,7 +2267,7 @@ config MIPS_VPE_APSP_API_MT
  config MIPS_CMP
         bool "MIPS CMP framework support (DEPRECATED)"
         depends on SYS_SUPPORTS_MIPS_CMP && !CPU_MIPSR6
-       select MIPS_GIC_IPI
+       select MIPS_GIC_IPI if MIPS_GIC
         select SMP
         select SYNC_R4K
         select SYS_SUPPORTS_SMP
@@ -2287,7 +2287,7 @@ config MIPS_CPS
         select MIPS_CM
         select MIPS_CPC
         select MIPS_CPS_PM if HOTPLUG_CPU
-       select MIPS_GIC_IPI
+       select MIPS_GIC_IPI if MIPS_GIC
         select SMP
         select SYNC_R4K if (CEVT_R4K || CSRC_R4K)
         select SYS_SUPPORTS_HOTPLUG_CPU
@@ -2306,6 +2306,7 @@ config MIPS_CPS_PM
         bool
  
  config MIPS_GIC_IPI
+       depends on MIPS_GIC
         bool
  
  config MIPS_CM
diff --git a/arch/mips/boot/compressed/uart-16550.c b/arch/mips/boot/compressed/uart-16550.c

index 408799a839b42f2d9d7d1232bd7ea4efda0cf2d4..f7521142deda55fb439fa0857a2383b4dad7cc83 100644 (file)
--- a/arch/mips/boot/compressed/uart-16550.c
+++ b/arch/mips/boot/compressed/uart-16550.c
@@ -17,7 +17,7 @@
  #define PORT(offset) (CKSEG1ADDR(AR7_REGS_UART0) + (4 * offset))
  #endif
  
-#ifdef CONFIG_MACH_JZ4740
+#if defined(CONFIG_MACH_JZ4740) || defined(CONFIG_MACH_JZ4780)
  #include <asm/mach-jz4740/base.h>
  #define PORT(offset) (CKSEG1ADDR(JZ4740_UART0_BASE_ADDR) + (4 * offset))
  #endif
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c

index 5fdaf8bdcd2ebeed8e31eb7825edd958bfc1c251..4f607341a7938867efc6f0f0646c218bf168421b 100644 (file)
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -732,21 +732,23 @@ static void __init resource_init(void)
                         end = HIGHMEM_START - 1;
  
                 res = alloc_bootmem(sizeof(struct resource));
+
+               res->start = start;
+               res->end = end;
+               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+
                 switch (boot_mem_map.map[i].type) {
                 case BOOT_MEM_RAM:
                 case BOOT_MEM_INIT_RAM:
                 case BOOT_MEM_ROM_DATA:
                         res->name = "System RAM";
+                       res->flags |= IORESOURCE_SYSRAM;
                         break;
                 case BOOT_MEM_RESERVED:
                 default:
                         res->name = "reserved";
                 }
  
-               res->start = start;
-               res->end = end;
-
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
                 request_resource(&iomem_resource, res);
  
                 /*
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c

index bd4385a8e6e86f7fbb9ca6d988f5eee155b9a8c7..2b521e07b8601a2c80e888b3064009779effa78b 100644 (file)
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -121,6 +121,7 @@ static inline void calculate_cpu_foreign_map(void)
         cpumask_t temp_foreign_map;
  
         /* Re-calculate the mask */
+       cpumask_clear(&temp_foreign_map);
         for_each_online_cpu(i) {
                 core_present = 0;
                 for_each_cpu(k, &temp_foreign_map)
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c

index 3110447ab1e9bffc37fa8572f5ce62d44d4a5fd9..70ef1a43c1148343101f2603a019fcefc1f90bce 100644 (file)
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -445,8 +445,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
  
         dvcpu->arch.wait = 0;
  
-       if (waitqueue_active(&dvcpu->wq))
-               wake_up_interruptible(&dvcpu->wq);
+       if (swait_active(&dvcpu->wq))
+               swake_up(&dvcpu->wq);
  
         return 0;
  }
@@ -1174,8 +1174,8 @@ static void kvm_mips_comparecount_func(unsigned long data)
         kvm_mips_callbacks->queue_timer_int(vcpu);
  
         vcpu->arch.wait = 0;
-       if (waitqueue_active(&vcpu->wq))
-               wake_up_interruptible(&vcpu->wq);
+       if (swait_active(&vcpu->wq))
+               swake_up(&vcpu->wq);
  }
  
  /* low level hrtimer wake routine */
diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h

index 3d0e17bcc8e905ece06053ae15b3ff5443e6b033..df0f52bd18b457f9efa1a0b9fc470b0803f50632 100644 (file)
--- a/arch/parisc/include/asm/cache.h
+++ b/arch/parisc/include/asm/cache.h
@@ -22,6 +22,9 @@
  
  #define __read_mostly __attribute__((__section__(".data..read_mostly")))
  
+/* Read-only memory is marked before mark_rodata_ro() is called. */
+#define __ro_after_init        __read_mostly
+
  void parisc_cache_init(void);  /* initializes cache-flushing */
  void disable_sr_hashing_asm(int); /* low level support for above */
  void disable_sr_hashing(void);   /* turns off space register hashing */
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h

index 845272ce9cc587222e835131bb5ed96be0be03d4..7bd69bd43a018577d099f373346383900b1f0121 100644 (file)
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -121,10 +121,6 @@ flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vma
         }
  }
  
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
-#endif
-
  #include <asm/kmap_types.h>
  
  #define ARCH_HAS_KMAP
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c

index 1b366c47768722081efe57a9db39eaba2a7dcde9..3c07d6b968772bc6de99bada8d83578ffe172c3d 100644 (file)
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -55,12 +55,12 @@ signed char pfnnid_map[PFNNID_MAP_MAX] __read_mostly;
  
  static struct resource data_resource = {
         .name   = "Kernel data",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
  };
  
  static struct resource code_resource = {
         .name   = "Kernel code",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
  };
  
  static struct resource pdcdata_resource = {
@@ -201,7 +201,7 @@ static void __init setup_bootmem(void)
                 res->name = "System RAM";
                 res->start = pmem_ranges[i].start_pfn << PAGE_SHIFT;
                 res->end = res->start + (pmem_ranges[i].pages << PAGE_SHIFT)-1;
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
                 request_resource(&iomem_resource, res);
         }
  
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h

index 9d08d8cbed1a1ec0e5893679c5e1fd9cb69dce09..c98afa538b3aeca91901e858c02884e820ca3dfa 100644 (file)
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -289,7 +289,7 @@ struct kvmppc_vcore {
         struct list_head runnable_threads;
         struct list_head preempt_list;
         spinlock_t lock;
-       wait_queue_head_t wq;
+       struct swait_queue_head wq;
         spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
         u64 stolen_tb;
         u64 preempt_tb;
@@ -629,7 +629,7 @@ struct kvm_vcpu_arch {
         u8 prodded;
         u32 last_inst;
  
-       wait_queue_head_t *wqp;
+       struct swait_queue_head *wqp;
         struct kvmppc_vcore *vcore;
         int ret;
         int trap;
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c

index ad8c9db61237223c3b807e3a9afed1487c44af1e..d544fa31175766bf48790f75bcd5458d6a505080 100644 (file)
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -114,8 +114,6 @@ extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */
  
  notrace void __init machine_init(u64 dt_ptr)
  {
-       lockdep_init();
-
         /* Enable early debugging if any specified (see udbg.h) */
         udbg_early_init();
  
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c

index 5c03a6a9b0542fac3d042f2481f367aae9178d38..f98be8383a39994868ba41d0f8492e99f48eb1fe 100644 (file)
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -255,9 +255,6 @@ void __init early_setup(unsigned long dt_ptr)
         setup_paca(&boot_paca);
         fixup_boot_paca();
  
-       /* Initialize lockdep early or else spinlocks will blow */
-       lockdep_init();
-
         /* -------- printk is now safe to use ------- */
  
         /* Enable early debugging if any specified (see udbg.h) */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c

index baeddb06811d738a6ea8be4ce920e5ea39a9645a..f1187bb6dd4d7f5960e57aea111bd1c12021408d 100644 (file)
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -114,11 +114,11 @@ static bool kvmppc_ipi_thread(int cpu)
  static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
  {
         int cpu;
-       wait_queue_head_t *wqp;
+       struct swait_queue_head *wqp;
  
         wqp = kvm_arch_vcpu_wq(vcpu);
-       if (waitqueue_active(wqp)) {
-               wake_up_interruptible(wqp);
+       if (swait_active(wqp)) {
+               swake_up(wqp);
                 ++vcpu->stat.halt_wakeup;
         }
  
@@ -701,8 +701,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                 tvcpu->arch.prodded = 1;
                 smp_mb();
                 if (vcpu->arch.ceded) {
-                       if (waitqueue_active(&vcpu->wq)) {
-                               wake_up_interruptible(&vcpu->wq);
+                       if (swait_active(&vcpu->wq)) {
+                               swake_up(&vcpu->wq);
                                 vcpu->stat.halt_wakeup++;
                         }
                 }
@@ -1459,7 +1459,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
         INIT_LIST_HEAD(&vcore->runnable_threads);
         spin_lock_init(&vcore->lock);
         spin_lock_init(&vcore->stoltb_lock);
-       init_waitqueue_head(&vcore->wq);
+       init_swait_queue_head(&vcore->wq);
         vcore->preempt_tb = TB_NIL;
         vcore->lpcr = kvm->arch.lpcr;
         vcore->first_vcpuid = core * threads_per_subcore;
@@ -2531,10 +2531,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
  {
         struct kvm_vcpu *vcpu;
         int do_sleep = 1;
+       DECLARE_SWAITQUEUE(wait);
  
-       DEFINE_WAIT(wait);
-
-       prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+       prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
  
         /*
          * Check one last time for pending exceptions and ceded state after
@@ -2548,7 +2547,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
         }
  
         if (!do_sleep) {
-               finish_wait(&vc->wq, &wait);
+               finish_swait(&vc->wq, &wait);
                 return;
         }
  
@@ -2556,7 +2555,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
         trace_kvmppc_vcore_blocked(vc, 0);
         spin_unlock(&vc->lock);
         schedule();
-       finish_wait(&vc->wq, &wait);
+       finish_swait(&vc->wq, &wait);
         spin_lock(&vc->lock);
         vc->vcore_state = VCORE_INACTIVE;
         trace_kvmppc_vcore_blocked(vc, 1);
@@ -2612,7 +2611,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                         kvmppc_start_thread(vcpu, vc);
                         trace_kvm_guest_enter(vcpu);
                 } else if (vc->vcore_state == VCORE_SLEEPING) {
-                       wake_up(&vc->wq);
+                       swake_up(&vc->wq);
                 }
  
         }
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S

index 6ee26de9a1ded02e0e08f376fd612049eedee0b4..25ae2c9913c39c2fae34fb60dd7ee655ba821b6e 100644 (file)
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1370,6 +1370,20 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
         std     r6, VCPU_ACOP(r9)
         stw     r7, VCPU_GUEST_PID(r9)
         std     r8, VCPU_WORT(r9)
+       /*
+        * Restore various registers to 0, where non-zero values
+        * set by the guest could disrupt the host.
+        */
+       li      r0, 0
+       mtspr   SPRN_IAMR, r0
+       mtspr   SPRN_CIABR, r0
+       mtspr   SPRN_DAWRX, r0
+       mtspr   SPRN_TCSCR, r0
+       mtspr   SPRN_WORT, r0
+       /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
+       li      r0, 1
+       sldi    r0, r0, 31
+       mtspr   SPRN_MMCRS, r0
  8:
  
         /* Save and reset AMR and UAMOR before turning on the MMU */
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c

index d0f0a514b04ed66da35d1ae6b27b964285ef2aec..f078a1f94fc267de0e4de530cd413608c2d93e5f 100644 (file)
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -541,7 +541,7 @@ static int __init add_system_ram_resources(void)
                         res->name = "System RAM";
                         res->start = base;
                         res->end = base + size - 1;
-                       res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+                       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
                         WARN_ON(request_resource(&iomem_resource, res) < 0);
                 }
         }
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h

index 8959ebb6d2c9eb35538b835a39522585321f9af4..b0c8ad0799c7f0c09607420441ea87735c88c6d9 100644 (file)
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -467,7 +467,7 @@ struct kvm_s390_irq_payload {
  struct kvm_s390_local_interrupt {
         spinlock_t lock;
         struct kvm_s390_float_interrupt *float_int;
-       wait_queue_head_t *wq;
+       struct swait_queue_head *wq;
         atomic_t *cpuflags;
         DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
         struct kvm_s390_irq_payload irq;
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h

index fb1b93ea3e3fead0efde9f30e819c55eecb88da9..e485817f7b1a97683e64ab04d1f7444e1aae2e1d 100644 (file)
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -15,17 +15,25 @@
  static inline int init_new_context(struct task_struct *tsk,
                                    struct mm_struct *mm)
  {
+       spin_lock_init(&mm->context.list_lock);
+       INIT_LIST_HEAD(&mm->context.pgtable_list);
+       INIT_LIST_HEAD(&mm->context.gmap_list);
         cpumask_clear(&mm->context.cpu_attach_mask);
         atomic_set(&mm->context.attach_count, 0);
         mm->context.flush_mm = 0;
-       mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS;
-       mm->context.asce_bits |= _ASCE_TYPE_REGION3;
  #ifdef CONFIG_PGSTE
         mm->context.alloc_pgste = page_table_allocate_pgste;
         mm->context.has_pgste = 0;
         mm->context.use_skey = 0;
  #endif
-       mm->context.asce_limit = STACK_TOP_MAX;
+       if (mm->context.asce_limit == 0) {
+               /* context created by exec, set asce limit to 4TB */
+               mm->context.asce_bits = _ASCE_TABLE_LENGTH |
+                       _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
+               mm->context.asce_limit = STACK_TOP_MAX;
+       } else if (mm->context.asce_limit == (1UL << 31)) {
+               mm_inc_nr_pmds(mm);
+       }
         crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
         return 0;
  }
@@ -111,8 +119,6 @@ static inline void activate_mm(struct mm_struct *prev,
  static inline void arch_dup_mmap(struct mm_struct *oldmm,
                                  struct mm_struct *mm)
  {
-       if (oldmm->context.asce_limit < mm->context.asce_limit)
-               crst_table_downgrade(mm, oldmm->context.asce_limit);
  }
  
  static inline void arch_exit_mmap(struct mm_struct *mm)
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h

index 7b7858f158b4574b549ab99648f977a3e249068c..d7cc79fb6191117a0ee0646ae6bbdf3a92a58132 100644 (file)
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -100,12 +100,26 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
  
  static inline pgd_t *pgd_alloc(struct mm_struct *mm)
  {
-       spin_lock_init(&mm->context.list_lock);
-       INIT_LIST_HEAD(&mm->context.pgtable_list);
-       INIT_LIST_HEAD(&mm->context.gmap_list);
-       return (pgd_t *) crst_table_alloc(mm);
+       unsigned long *table = crst_table_alloc(mm);
+
+       if (!table)
+               return NULL;
+       if (mm->context.asce_limit == (1UL << 31)) {
+               /* Forking a compat process with 2 page table levels */
+               if (!pgtable_pmd_page_ctor(virt_to_page(table))) {
+                       crst_table_free(mm, table);
+                       return NULL;
+               }
+       }
+       return (pgd_t *) table;
+}
+
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+       if (mm->context.asce_limit == (1UL << 31))
+               pgtable_pmd_page_dtor(virt_to_page(pgd));
+       crst_table_free(mm, (unsigned long *) pgd);
  }
-#define pgd_free(mm, pgd) crst_table_free(mm, (unsigned long *) pgd)
  
  static inline void pmd_populate(struct mm_struct *mm,
                                 pmd_t *pmd, pgtable_t pte)
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c

index c55576bbaa1f7555479994d6848c9aa5b60f189b..a0684de5a93b99ae199f9bf6906327f4513f662e 100644 (file)
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -448,7 +448,6 @@ void __init startup_init(void)
         rescue_initrd();
         clear_bss_section();
         init_kernel_storage_key();
-       lockdep_init();
         lockdep_off();
         setup_lowcore_early();
         setup_facility_list();
diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S

index c5febe84eba633385cd7a8c09bee87e484c2e05c..03c2b469c4725172e4c6750cb8bcb0fb4edea075 100644 (file)
--- a/arch/s390/kernel/head64.S
+++ b/arch/s390/kernel/head64.S
@@ -16,7 +16,7 @@
  
  __HEAD
  ENTRY(startup_continue)
-       tm      __LC_STFLE_FAC_LIST+6,0x80      # LPP available ?
+       tm      __LC_STFLE_FAC_LIST+5,0x80      # LPP available ?
         jz      0f
         xc      __LC_LPP+1(7,0),__LC_LPP+1      # clear lpp and current_pid
         mvi     __LC_LPP,0x80                   #   and set LPP_MAGIC
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c

index 9220db5c996aa5250ca4d904361c51cec33d7827..cedb0198675f7577c7237bb61c006acaf0e984d4 100644 (file)
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -374,17 +374,17 @@ static void __init setup_lowcore(void)
  
  static struct resource code_resource = {
         .name  = "Kernel code",
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
  };
  
  static struct resource data_resource = {
         .name = "Kernel data",
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
  };
  
  static struct resource bss_resource = {
         .name = "Kernel bss",
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
  };
  
  static struct resource __initdata *standard_resources[] = {
@@ -408,7 +408,7 @@ static void __init setup_resources(void)
  
         for_each_memblock(memory, reg) {
                 res = alloc_bootmem_low(sizeof(*res));
-               res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+               res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
  
                 res->name = "System RAM";
                 res->start = reg->base;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c

index f88ca72c3a77a52e05e65cfa79206cd8e41aa786..9ffc7322179213f031939fa184bc6c93545af559 100644 (file)
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -966,13 +966,13 @@ no_timer:
  
  void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
  {
-       if (waitqueue_active(&vcpu->wq)) {
+       if (swait_active(&vcpu->wq)) {
                 /*
                  * The vcpu gave up the cpu voluntarily, mark it as a good
                  * yield-candidate.
                  */
                 vcpu->preempted = true;
-               wake_up_interruptible(&vcpu->wq);
+               swake_up(&vcpu->wq);
                 vcpu->stat.halt_wakeup++;
         }
  }
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c

index 4af21c771f9b3925bf860d0dc86cebcc803cf965..03dfe9c667f4eb944705787e54ff7e6ac3c08afb 100644 (file)
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2381,7 +2381,7 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa)
  
         /* manually convert vector registers if necessary */
         if (MACHINE_HAS_VX) {
-               convert_vx_to_fp(fprs, current->thread.fpu.vxrs);
+               convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs);
                 rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA,
                                      fprs, 128);
         } else {
diff --git a/arch/score/kernel/setup.c b/arch/score/kernel/setup.c

index b48459afefddb835582e548f5b8d7b4ffd026e0d..f3a0649ab5217ac5f87b83580723dfda40fff3de 100644 (file)
--- a/arch/score/kernel/setup.c
+++ b/arch/score/kernel/setup.c
@@ -101,7 +101,7 @@ static void __init resource_init(void)
         res->name = "System RAM";
         res->start = MEMORY_START;
         res->end = MEMORY_START + MEMORY_SIZE - 1;
-       res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
         request_resource(&iomem_resource, res);
  
         request_resource(res, &code_resource);
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c

index de19cfa768f208708406e3350b5e9aecc92a6266..3f1c18b28e8af5fc414c13ebeb29ccbf04cd63a5 100644 (file)
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -78,17 +78,17 @@ static char __initdata command_line[COMMAND_LINE_SIZE] = { 0, };
  
  static struct resource code_resource = {
         .name = "Kernel code",
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
  };
  
  static struct resource data_resource = {
         .name = "Kernel data",
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
  };
  
  static struct resource bss_resource = {
         .name   = "Kernel bss",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
  };
  
  unsigned long memory_start;
@@ -202,7 +202,7 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn,
         res->name = "System RAM";
         res->start = start;
         res->end = end - 1;
-       res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
  
         if (request_resource(&iomem_resource, res)) {
                 pr_err("unable to request memory_resource 0x%lx 0x%lx\n",
diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S

index f2d30cab5b3f388fa9b446cf5b12afda92fa0a9d..cd1f592cd3479f8c94c599bfc589087184d4d180 100644 (file)
--- a/arch/sparc/kernel/head_64.S
+++ b/arch/sparc/kernel/head_64.S
@@ -696,14 +696,6 @@ tlb_fixup_done:
         call    __bzero
          sub    %o1, %o0, %o1
  
-#ifdef CONFIG_LOCKDEP
-       /* We have this call this super early, as even prom_init can grab
-        * spinlocks and thus call into the lockdep code.
-        */
-       call    lockdep_init
-        nop
-#endif
-
         call    prom_init
          mov    %l7, %o0                        ! OpenPROM cif handler
  
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c

index 6f216853f2724f8395bdd36ab18c4eadbeae0366..1cfe6aab7a11572d54f6848fe4656b7b40af8cf2 100644 (file)
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2863,17 +2863,17 @@ void hugetlb_setup(struct pt_regs *regs)
  
  static struct resource code_resource = {
         .name   = "Kernel code",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
  };
  
  static struct resource data_resource = {
         .name   = "Kernel data",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
  };
  
  static struct resource bss_resource = {
         .name   = "Kernel bss",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
  };
  
  static inline resource_size_t compute_kern_paddr(void *addr)
@@ -2909,7 +2909,7 @@ static int __init report_memory(void)
                 res->name = "System RAM";
                 res->start = pavail[i].phys_addr;
                 res->end = pavail[i].phys_addr + pavail[i].reg_size - 1;
-               res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+               res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
  
                 if (insert_resource(&iomem_resource, res) < 0) {
                         pr_warn("Resource insertion failed.\n");
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c

index bbb855de6569841200c991bf610b090c4e94c47b..a992238e9b58260fdfa067c4cc2c0d348ceda41f 100644 (file)
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -1632,14 +1632,14 @@ static struct resource data_resource = {
         .name   = "Kernel data",
         .start  = 0,
         .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
  };
  
  static struct resource code_resource = {
         .name   = "Kernel code",
         .start  = 0,
         .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
  };
  
  /*
@@ -1673,10 +1673,15 @@ insert_ram_resource(u64 start_pfn, u64 end_pfn, bool reserved)
                 kzalloc(sizeof(struct resource), GFP_ATOMIC);
         if (!res)
                 return NULL;
-       res->name = reserved ? "Reserved" : "System RAM";
         res->start = start_pfn << PAGE_SHIFT;
         res->end = (end_pfn << PAGE_SHIFT) - 1;
         res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+       if (reserved) {
+               res->name = "Reserved";
+       } else {
+               res->name = "System RAM";
+               res->flags |= IORESOURCE_SYSRAM;
+       }
         if (insert_resource(&iomem_resource, res)) {
                 kfree(res);
                 return NULL;
diff --git a/arch/unicore32/kernel/setup.c b/arch/unicore32/kernel/setup.c

index 3fa317f96122130d30cfc7be41d4e1450fc56a02..c2bffa5614a48102a1d66a60865a2e3474a49597 100644 (file)
--- a/arch/unicore32/kernel/setup.c
+++ b/arch/unicore32/kernel/setup.c
@@ -72,13 +72,13 @@ static struct resource mem_res[] = {
                 .name = "Kernel code",
                 .start = 0,
                 .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
         },
         {
                 .name = "Kernel data",
                 .start = 0,
                 .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
         }
  };
  
@@ -211,7 +211,7 @@ request_standard_resources(struct meminfo *mi)
                 res->name  = "System RAM";
                 res->start = mi->bank[i].start;
                 res->end   = mi->bank[i].start + mi->bank[i].size - 1;
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
  
                 request_resource(&iomem_resource, res);
  
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild

index 1538562cc720e78132d0eb31c38116b029d56ce0..eb3abf8ac44eb33f333ed29727dfd49ba6bfbf38 100644 (file)
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -1,6 +1,7 @@
-
  obj-y += entry/
  
+obj-$(CONFIG_PERF_EVENTS) += events/
+
  obj-$(CONFIG_KVM) += kvm/
  
  # Xen paravirtualization support
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig

index c46662f64c392050d1c45b722e3de7c01f87e250..b1051057e5b014b6a2b11b5c2cdbef866633e0c0 100644 (file)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -303,6 +303,9 @@ config ARCH_SUPPORTS_UPROBES
  config FIX_EARLYCON_MEM
         def_bool y
  
+config DEBUG_RODATA
+       def_bool y
+
  config PGTABLE_LEVELS
         int
         default 4 if X86_64
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug

index 9b18ed97a8a2968b7b293bc83484d8311d830e83..67eec55093a5dc97634aea899c4be86ff9f18f00 100644 (file)
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -74,28 +74,16 @@ config EFI_PGT_DUMP
           issues with the mapping of the EFI runtime regions into that
           table.
  
-config DEBUG_RODATA
-       bool "Write protect kernel read-only data structures"
-       default y
-       depends on DEBUG_KERNEL
-       ---help---
-         Mark the kernel read-only data as write-protected in the pagetables,
-         in order to catch accidental (and incorrect) writes to such const
-         data. This is recommended so that we can catch kernel bugs sooner.
-         If in doubt, say "Y".
-
  config DEBUG_RODATA_TEST
-       bool "Testcase for the DEBUG_RODATA feature"
-       depends on DEBUG_RODATA
+       bool "Testcase for the marking rodata read-only"
         default y
         ---help---
-         This option enables a testcase for the DEBUG_RODATA
-         feature as well as for the change_page_attr() infrastructure.
+         This option enables a testcase for the setting rodata read-only
+         as well as for the change_page_attr() infrastructure.
           If in doubt, say "N"
  
  config DEBUG_WX
         bool "Warn on W+X mappings at boot"
-       depends on DEBUG_RODATA
         select X86_PTDUMP_CORE
         ---help---
           Generate a warning if any W+X mappings are found at boot.
@@ -350,16 +338,6 @@ config DEBUG_IMR_SELFTEST
  
           If unsure say N here.
  
-config X86_DEBUG_STATIC_CPU_HAS
-       bool "Debug alternatives"
-       depends on DEBUG_KERNEL
-       ---help---
-         This option causes additional code to be generated which
-         fails if static_cpu_has() is used before alternatives have
-         run.
-
-         If unsure, say N.
-
  config X86_DEBUG_FPU
         bool "Debug the x86 FPU code"
         depends on DEBUG_KERNEL
diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h

index ea97697e51e40fd2ff232caa5e3147ff0f6e725c..4cb404fd45ceaa0e89f9e669d6a59de4ce378449 100644 (file)
--- a/arch/x86/boot/cpuflags.h
+++ b/arch/x86/boot/cpuflags.h
@@ -1,7 +1,7 @@
  #ifndef BOOT_CPUFLAGS_H
  #define BOOT_CPUFLAGS_H
  
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/processor-flags.h>
  
  struct cpu_features {
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c

index 637097e66a62a675de99040bdcf4f8fc011e2af0..f72498dc90d2376f5f0ffafbc4305be959344343 100644 (file)
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -17,7 +17,7 @@
  
  #include "../include/asm/required-features.h"
  #include "../include/asm/disabled-features.h"
-#include "../include/asm/cpufeature.h"
+#include "../include/asm/cpufeatures.h"
  #include "../kernel/cpu/capflags.c"
  
  int main(void)
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c

index a7661c430cd98d28795ddee61bf5506c5ef3a1e5..0702d2531bc7f441364adca7ebc2d0e1a227f0bd 100644 (file)
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -49,7 +49,6 @@ typedef unsigned int   u32;
  
  /* This must be large enough to hold the entire setup */
  u8 buf[SETUP_SECT_MAX*512];
-int is_big_kernel;
  
  #define PECOFF_RELOC_RESERVE 0x20
  
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c

index 07d2c6c86a5483216684970489fcba3b4478007d..27226df3f7d8aa16a647a051d8ac19ef02ef5aa2 100644 (file)
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -33,7 +33,7 @@
  #include <linux/crc32.h>
  #include <crypto/internal/hash.h>
  
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/cpu_device_id.h>
  #include <asm/fpu/api.h>
  
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c

index 0e9871693f2469d3106f57ab6dc94d85a3f16146..0857b1a1de3bdc411b3fea94517a9db6c9b9952c 100644 (file)
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -30,7 +30,7 @@
  #include <linux/kernel.h>
  #include <crypto/internal/hash.h>
  
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/cpu_device_id.h>
  #include <asm/fpu/internal.h>
  
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c

index a3fcfc97a311d5b660e0dcda3be489038f231dce..cd4df93225014b6fc64e3020bf07c0600647fd7c 100644 (file)
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -30,7 +30,7 @@
  #include <linux/string.h>
  #include <linux/kernel.h>
  #include <asm/fpu/api.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/cpu_device_id.h>
  
  asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf,
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h

index e32206e0986828390e769604ec6cd510b1f2296e..9a9e5884066c6581878b56cde27e75f542b21249 100644 (file)
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -201,37 +201,6 @@ For 32-bit we have the following conventions - kernel is built with
         .byte 0xf1
         .endm
  
-#else /* CONFIG_X86_64 */
-
-/*
- * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These
- * are different from the entry_32.S versions in not changing the segment
- * registers. So only suitable for in kernel use, not when transitioning
- * from or to user space. The resulting stack frame is not a standard
- * pt_regs frame. The main use case is calling C code from assembler
- * when all the registers need to be preserved.
- */
-
-       .macro SAVE_ALL
-       pushl %eax
-       pushl %ebp
-       pushl %edi
-       pushl %esi
-       pushl %edx
-       pushl %ecx
-       pushl %ebx
-       .endm
-
-       .macro RESTORE_ALL
-       popl %ebx
-       popl %ecx
-       popl %edx
-       popl %esi
-       popl %edi
-       popl %ebp
-       popl %eax
-       .endm
-
  #endif /* CONFIG_X86_64 */
  
  /*
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c

index 03663740c86655cabf21504578e97d73d98595be..e79d93d44ecd9c66b1e29078a11aa1ee405c0fa2 100644 (file)
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -26,6 +26,7 @@
  #include <asm/traps.h>
  #include <asm/vdso.h>
  #include <asm/uaccess.h>
+#include <asm/cpufeature.h>
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/syscalls.h>
@@ -44,6 +45,8 @@ __visible void enter_from_user_mode(void)
         CT_WARN_ON(ct_state() != CONTEXT_USER);
         user_exit();
  }
+#else
+static inline void enter_from_user_mode(void) {}
  #endif
  
  static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
@@ -84,17 +87,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
  
         work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
  
-#ifdef CONFIG_CONTEXT_TRACKING
-       /*
-        * If TIF_NOHZ is set, we are required to call user_exit() before
-        * doing anything that could touch RCU.
-        */
-       if (work & _TIF_NOHZ) {
-               enter_from_user_mode();
-               work &= ~_TIF_NOHZ;
-       }
-#endif
-
  #ifdef CONFIG_SECCOMP
         /*
          * Do seccomp first -- it should minimize exposure of other
@@ -171,16 +163,6 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
         if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
                 BUG_ON(regs != task_pt_regs(current));
  
-       /*
-        * If we stepped into a sysenter/syscall insn, it trapped in
-        * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
-        * If user-mode had set TF itself, then it's still clear from
-        * do_debug() and we need to set it again to restore the user
-        * state.  If we entered on the slow path, TF was already set.
-        */
-       if (work & _TIF_SINGLESTEP)
-               regs->flags |= X86_EFLAGS_TF;
-
  #ifdef CONFIG_SECCOMP
         /*
          * Call seccomp_phase2 before running the other hooks so that
@@ -268,6 +250,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
  /* Called with IRQs disabled. */
  __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
  {
+       struct thread_info *ti = pt_regs_to_thread_info(regs);
         u32 cached_flags;
  
         if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
@@ -275,12 +258,22 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
  
         lockdep_sys_exit();
  
-       cached_flags =
-               READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+       cached_flags = READ_ONCE(ti->flags);
  
         if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
                 exit_to_usermode_loop(regs, cached_flags);
  
+#ifdef CONFIG_COMPAT
+       /*
+        * Compat syscalls set TS_COMPAT.  Make sure we clear it before
+        * returning to user mode.  We need to clear it *after* signal
+        * handling, because syscall restart has a fixup for compat
+        * syscalls.  The fixup is exercised by the ptrace_syscall_32
+        * selftest.
+        */
+       ti->status &= ~TS_COMPAT;
+#endif
+
         user_enter();
  }
  
@@ -332,33 +325,45 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
         if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
                 syscall_slow_exit_work(regs, cached_flags);
  
-#ifdef CONFIG_COMPAT
+       local_irq_disable();
+       prepare_exit_to_usermode(regs);
+}
+
+#ifdef CONFIG_X86_64
+__visible void do_syscall_64(struct pt_regs *regs)
+{
+       struct thread_info *ti = pt_regs_to_thread_info(regs);
+       unsigned long nr = regs->orig_ax;
+
+       enter_from_user_mode();
+       local_irq_enable();
+
+       if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
+               nr = syscall_trace_enter(regs);
+
         /*
-        * Compat syscalls set TS_COMPAT.  Make sure we clear it before
-        * returning to user mode.
+        * NB: Native and x32 syscalls are dispatched from the same
+        * table.  The only functional difference is the x32 bit in
+        * regs->orig_ax, which changes the behavior of some syscalls.
          */
-       ti->status &= ~TS_COMPAT;
-#endif
+       if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
+               regs->ax = sys_call_table[nr & __SYSCALL_MASK](
+                       regs->di, regs->si, regs->dx,
+                       regs->r10, regs->r8, regs->r9);
+       }
  
-       local_irq_disable();
-       prepare_exit_to_usermode(regs);
+       syscall_return_slowpath(regs);
  }
+#endif
  
  #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
  /*
- * Does a 32-bit syscall.  Called with IRQs on and does all entry and
- * exit work and returns with IRQs off.  This function is extremely hot
- * in workloads that use it, and it's usually called from
+ * Does a 32-bit syscall.  Called with IRQs on in CONTEXT_KERNEL.  Does
+ * all entry and exit work and returns with IRQs off.  This function is
+ * extremely hot in workloads that use it, and it's usually called from
   * do_fast_syscall_32, so forcibly inline it to improve performance.
   */
-#ifdef CONFIG_X86_32
-/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */
-__visible
-#else
-/* 64-bit kernels use do_syscall_32_irqs_off() instead. */
-static
-#endif
-__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
  {
         struct thread_info *ti = pt_regs_to_thread_info(regs);
         unsigned int nr = (unsigned int)regs->orig_ax;
@@ -393,14 +398,13 @@ __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
         syscall_return_slowpath(regs);
  }
  
-#ifdef CONFIG_X86_64
-/* Handles INT80 on 64-bit kernels */
-__visible void do_syscall_32_irqs_off(struct pt_regs *regs)
+/* Handles int $0x80 */
+__visible void do_int80_syscall_32(struct pt_regs *regs)
  {
+       enter_from_user_mode();
         local_irq_enable();
         do_syscall_32_irqs_on(regs);
  }
-#endif
  
  /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
  __visible long do_fast_syscall_32(struct pt_regs *regs)
@@ -420,12 +424,11 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
          */
         regs->ip = landing_pad;
  
-       /*
-        * Fetch EBP from where the vDSO stashed it.
-        *
-        * WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
-        */
+       enter_from_user_mode();
+
         local_irq_enable();
+
+       /* Fetch EBP from where the vDSO stashed it. */
         if (
  #ifdef CONFIG_X86_64
                 /*
@@ -443,9 +446,6 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
                 /* User code screwed up. */
                 local_irq_disable();
                 regs->ax = -EFAULT;
-#ifdef CONFIG_CONTEXT_TRACKING
-               enter_from_user_mode();
-#endif
                 prepare_exit_to_usermode(regs);
                 return 0;       /* Keep it simple: use IRET. */
         }
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S

index bb3e376d0f33c8dda5a9cf7b4e2b3f2d39f6389a..10868aa734dc07e9a438bcdd3d4c7b4aae0106d3 100644 (file)
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -40,7 +40,7 @@
  #include <asm/processor-flags.h>
  #include <asm/ftrace.h>
  #include <asm/irq_vectors.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/alternative-asm.h>
  #include <asm/asm.h>
  #include <asm/smap.h>
@@ -287,20 +287,93 @@ need_resched:
  END(resume_kernel)
  #endif
  
-       # SYSENTER  call handler stub
+GLOBAL(__begin_SYSENTER_singlestep_region)
+/*
+ * All code from here through __end_SYSENTER_singlestep_region is subject
+ * to being single-stepped if a user program sets TF and executes SYSENTER.
+ * There is absolutely nothing that we can do to prevent this from happening
+ * (thanks Intel!).  To keep our handling of this situation as simple as
+ * possible, we handle TF just like AC and NT, except that our #DB handler
+ * will ignore all of the single-step traps generated in this range.
+ */
+
+#ifdef CONFIG_XEN
+/*
+ * Xen doesn't set %esp to be precisely what the normal SYSENTER
+ * entry point expects, so fix it up before using the normal path.
+ */
+ENTRY(xen_sysenter_target)
+       addl    $5*4, %esp                      /* remove xen-provided frame */
+       jmp     sysenter_past_esp
+#endif
+
+/*
+ * 32-bit SYSENTER entry.
+ *
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * if X86_FEATURE_SEP is available.  This is the preferred system call
+ * entry on 32-bit systems.
+ *
+ * The SYSENTER instruction, in principle, should *only* occur in the
+ * vDSO.  In practice, a small number of Android devices were shipped
+ * with a copy of Bionic that inlined a SYSENTER instruction.  This
+ * never happened in any of Google's Bionic versions -- it only happened
+ * in a narrow range of Intel-provided versions.
+ *
+ * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
+ * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old EIP (!!!), ESP, or EFLAGS.
+ *
+ * To avoid losing track of EFLAGS.VM (and thus potentially corrupting
+ * user and/or vm86 state), we explicitly disable the SYSENTER
+ * instruction in vm86 mode by reprogramming the MSRs.
+ *
+ * Arguments:
+ * eax  system call number
+ * ebx  arg1
+ * ecx  arg2
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * ebp  user stack
+ * 0(%ebp) arg6
+ */
  ENTRY(entry_SYSENTER_32)
         movl    TSS_sysenter_sp0(%esp), %esp
  sysenter_past_esp:
         pushl   $__USER_DS              /* pt_regs->ss */
         pushl   %ebp                    /* pt_regs->sp (stashed in bp) */
         pushfl                          /* pt_regs->flags (except IF = 0) */
-       ASM_CLAC                        /* Clear AC after saving FLAGS */
         orl     $X86_EFLAGS_IF, (%esp)  /* Fix IF */
         pushl   $__USER_CS              /* pt_regs->cs */
         pushl   $0                      /* pt_regs->ip = 0 (placeholder) */
         pushl   %eax                    /* pt_regs->orig_ax */
         SAVE_ALL pt_regs_ax=$-ENOSYS    /* save rest */
  
+       /*
+        * SYSENTER doesn't filter flags, so we need to clear NT, AC
+        * and TF ourselves.  To save a few cycles, we can check whether
+        * either was set instead of doing an unconditional popfq.
+        * This needs to happen before enabling interrupts so that
+        * we don't get preempted with NT set.
+        *
+        * If TF is set, we will single-step all the way to here -- do_debug
+        * will ignore all the traps.  (Yes, this is slow, but so is
+        * single-stepping in general.  This allows us to avoid having
+        * a more complicated code to handle the case where a user program
+        * forces us to single-step through the SYSENTER entry code.)
+        *
+        * NB.: .Lsysenter_fix_flags is a label with the code under it moved
+        * out-of-line as an optimization: NT is unlikely to be set in the
+        * majority of the cases and instead of polluting the I$ unnecessarily,
+        * we're keeping that code behind a branch which will predict as
+        * not-taken and therefore its instructions won't be fetched.
+        */
+       testl   $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
+       jnz     .Lsysenter_fix_flags
+.Lsysenter_flags_fixed:
+
         /*
          * User mode is traced as though IRQs are on, and SYSENTER
          * turned them off.
@@ -326,6 +399,15 @@ sysenter_past_esp:
         popl    %ebp                    /* pt_regs->bp */
         popl    %eax                    /* pt_regs->ax */
  
+       /*
+        * Restore all flags except IF. (We restore IF separately because
+        * STI gives a one-instruction window in which we won't be interrupted,
+        * whereas POPF does not.)
+        */
+       addl    $PT_EFLAGS-PT_DS, %esp  /* point esp at pt_regs->flags */
+       btr     $X86_EFLAGS_IF_BIT, (%esp)
+       popfl
+
         /*
          * Return back to the vDSO, which will pop ecx and edx.
          * Don't bother with DS and ES (they already contain __USER_DS).
@@ -339,28 +421,63 @@ sysenter_past_esp:
  .popsection
         _ASM_EXTABLE(1b, 2b)
         PTGS_TO_GS_EX
+
+.Lsysenter_fix_flags:
+       pushl   $X86_EFLAGS_FIXED
+       popfl
+       jmp     .Lsysenter_flags_fixed
+GLOBAL(__end_SYSENTER_singlestep_region)
  ENDPROC(entry_SYSENTER_32)
  
-       # system call handler stub
+/*
+ * 32-bit legacy system call entry.
+ *
+ * 32-bit x86 Linux system calls traditionally used the INT $0x80
+ * instruction.  INT $0x80 lands here.
+ *
+ * This entry point can be used by any 32-bit perform system calls.
+ * Instances of INT $0x80 can be found inline in various programs and
+ * libraries.  It is also used by the vDSO's __kernel_vsyscall
+ * fallback for hardware that doesn't support a faster entry method.
+ * Restarted 32-bit system calls also fall back to INT $0x80
+ * regardless of what instruction was originally used to do the system
+ * call.  (64-bit programs can use INT $0x80 as well, but they can
+ * only run on 64-bit kernels and therefore land in
+ * entry_INT80_compat.)
+ *
+ * This is considered a slow path.  It is not used by most libc
+ * implementations on modern hardware except during process startup.
+ *
+ * Arguments:
+ * eax  system call number
+ * ebx  arg1
+ * ecx  arg2
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * ebp  arg6
+ */
  ENTRY(entry_INT80_32)
         ASM_CLAC
         pushl   %eax                    /* pt_regs->orig_ax */
         SAVE_ALL pt_regs_ax=$-ENOSYS    /* save rest */
  
         /*
-        * User mode is traced as though IRQs are on.  Unlike the 64-bit
-        * case, INT80 is a trap gate on 32-bit kernels, so interrupts
-        * are already on (unless user code is messing around with iopl).
+        * User mode is traced as though IRQs are on, and the interrupt gate
+        * turned them off.
          */
+       TRACE_IRQS_OFF
  
         movl    %esp, %eax
-       call    do_syscall_32_irqs_on
+       call    do_int80_syscall_32
  .Lsyscall_32_done:
  
  restore_all:
         TRACE_IRQS_IRET
  restore_all_notrace:
  #ifdef CONFIG_X86_ESPFIX32
+       ALTERNATIVE     "jmp restore_nocheck", "", X86_BUG_ESPFIX
+
         movl    PT_EFLAGS(%esp), %eax           # mix EFLAGS, SS and CS
         /*
          * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
@@ -387,19 +504,6 @@ ENTRY(iret_exc     )
  
  #ifdef CONFIG_X86_ESPFIX32
  ldt_ss:
-#ifdef CONFIG_PARAVIRT
-       /*
-        * The kernel can't run on a non-flat stack if paravirt mode
-        * is active.  Rather than try to fixup the high bits of
-        * ESP, bypass this code entirely.  This may break DOSemu
-        * and/or Wine support in a paravirt VM, although the option
-        * is still available to implement the setting of the high
-        * 16-bits in the INTERRUPT_RETURN paravirt-op.
-        */
-       cmpl    $0, pv_info+PARAVIRT_enabled
-       jne     restore_nocheck
-#endif
-
  /*
   * Setup and switch to ESPFIX stack
   *
@@ -632,14 +736,6 @@ ENTRY(spurious_interrupt_bug)
  END(spurious_interrupt_bug)
  
  #ifdef CONFIG_XEN
-/*
- * Xen doesn't set %esp to be precisely what the normal SYSENTER
- * entry point expects, so fix it up before using the normal path.
- */
-ENTRY(xen_sysenter_target)
-       addl    $5*4, %esp                      /* remove xen-provided frame */
-       jmp     sysenter_past_esp
-
  ENTRY(xen_hypervisor_callback)
         pushl   $-1                             /* orig_ax = -1 => not a system call */
         SAVE_ALL
@@ -939,51 +1035,48 @@ error_code:
         jmp     ret_from_exception
  END(page_fault)
  
-/*
- * Debug traps and NMI can happen at the one SYSENTER instruction
- * that sets up the real kernel stack. Check here, since we can't
- * allow the wrong stack to be used.
- *
- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
- * already pushed 3 words if it hits on the sysenter instruction:
- * eflags, cs and eip.
- *
- * We just load the right stack, and push the three (known) values
- * by hand onto the new stack - while updating the return eip past
- * the instruction that would have done it for sysenter.
- */
-.macro FIX_STACK offset ok label
-       cmpw    $__KERNEL_CS, 4(%esp)
-       jne     \ok
-\label:
-       movl    TSS_sysenter_sp0 + \offset(%esp), %esp
-       pushfl
-       pushl   $__KERNEL_CS
-       pushl   $sysenter_past_esp
-.endm
-
  ENTRY(debug)
+       /*
+        * #DB can happen at the first instruction of
+        * entry_SYSENTER_32 or in Xen's SYSENTER prologue.  If this
+        * happens, then we will be running on a very small stack.  We
+        * need to detect this condition and switch to the thread
+        * stack before calling any C code at all.
+        *
+        * If you edit this code, keep in mind that NMIs can happen in here.
+        */
         ASM_CLAC
-       cmpl    $entry_SYSENTER_32, (%esp)
-       jne     debug_stack_correct
-       FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
-debug_stack_correct:
         pushl   $-1                             # mark this as an int
         SAVE_ALL
-       TRACE_IRQS_OFF
         xorl    %edx, %edx                      # error code 0
         movl    %esp, %eax                      # pt_regs pointer
+
+       /* Are we currently on the SYSENTER stack? */
+       PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+       subl    %eax, %ecx      /* ecx = (end of SYSENTER_stack) - esp */
+       cmpl    $SIZEOF_SYSENTER_stack, %ecx
+       jb      .Ldebug_from_sysenter_stack
+
+       TRACE_IRQS_OFF
+       call    do_debug
+       jmp     ret_from_exception
+
+.Ldebug_from_sysenter_stack:
+       /* We're on the SYSENTER stack.  Switch off. */
+       movl    %esp, %ebp
+       movl    PER_CPU_VAR(cpu_current_top_of_stack), %esp
+       TRACE_IRQS_OFF
         call    do_debug
+       movl    %ebp, %esp
         jmp     ret_from_exception
  END(debug)
  
  /*
- * NMI is doubly nasty. It can happen _while_ we're handling
- * a debug fault, and the debug fault hasn't yet been able to
- * clear up the stack. So we first check whether we got  an
- * NMI on the sysenter entry path, but after that we need to
- * check whether we got an NMI on the debug path where the debug
- * fault happened on the sysenter path.
+ * NMI is doubly nasty.  It can happen on the first instruction of
+ * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
+ * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
+ * switched stacks.  We handle both conditions by simply checking whether we
+ * interrupted kernel code running on the SYSENTER stack.
   */
  ENTRY(nmi)
         ASM_CLAC
@@ -994,41 +1087,32 @@ ENTRY(nmi)
         popl    %eax
         je      nmi_espfix_stack
  #endif
-       cmpl    $entry_SYSENTER_32, (%esp)
-       je      nmi_stack_fixup
-       pushl   %eax
-       movl    %esp, %eax
-       /*
-        * Do not access memory above the end of our stack page,
-        * it might not exist.
-        */
-       andl    $(THREAD_SIZE-1), %eax
-       cmpl    $(THREAD_SIZE-20), %eax
-       popl    %eax
-       jae     nmi_stack_correct
-       cmpl    $entry_SYSENTER_32, 12(%esp)
-       je      nmi_debug_stack_check
-nmi_stack_correct:
-       pushl   %eax
+
+       pushl   %eax                            # pt_regs->orig_ax
         SAVE_ALL
         xorl    %edx, %edx                      # zero error code
         movl    %esp, %eax                      # pt_regs pointer
+
+       /* Are we currently on the SYSENTER stack? */
+       PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+       subl    %eax, %ecx      /* ecx = (end of SYSENTER_stack) - esp */
+       cmpl    $SIZEOF_SYSENTER_stack, %ecx
+       jb      .Lnmi_from_sysenter_stack
+
+       /* Not on SYSENTER stack. */
         call    do_nmi
         jmp     restore_all_notrace
  
-nmi_stack_fixup:
-       FIX_STACK 12, nmi_stack_correct, 1
-       jmp     nmi_stack_correct
-
-nmi_debug_stack_check:
-       cmpw    $__KERNEL_CS, 16(%esp)
-       jne     nmi_stack_correct
-       cmpl    $debug, (%esp)
-       jb      nmi_stack_correct
-       cmpl    $debug_esp_fix_insn, (%esp)
-       ja      nmi_stack_correct
-       FIX_STACK 24, nmi_stack_correct, 1
-       jmp     nmi_stack_correct
+.Lnmi_from_sysenter_stack:
+       /*
+        * We're on the SYSENTER stack.  Switch off.  No one (not even debug)
+        * is using the thread stack right now, so it's safe for us to use it.
+        */
+       movl    %esp, %ebp
+       movl    PER_CPU_VAR(cpu_current_top_of_stack), %esp
+       call    do_nmi
+       movl    %ebp, %esp
+       jmp     restore_all_notrace
  
  #ifdef CONFIG_X86_ESPFIX32
  nmi_espfix_stack:
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S

index 9d34d3cfceb61c1073c507f5a0f90e939eb146a3..858b555e274b8d763d97d9b9cf14998125bce563 100644 (file)
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -103,6 +103,16 @@ ENDPROC(native_usergs_sysret64)
  /*
   * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
   *
+ * This is the only entry point used for 64-bit system calls.  The
+ * hardware interface is reasonably well designed and the register to
+ * argument mapping Linux uses fits well with the registers that are
+ * available when SYSCALL is used.
+ *
+ * SYSCALL instructions can be found inlined in libc implementations as
+ * well as some other programs and libraries.  There are also a handful
+ * of SYSCALL instructions in the vDSO used, for example, as a
+ * clock_gettimeofday fallback.
+ *
   * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
   * then loads new ss, cs, and rip from previously programmed MSRs.
   * rflags gets masked by a value from another MSR (so CLD and CLAC
@@ -145,17 +155,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
         movq    %rsp, PER_CPU_VAR(rsp_scratch)
         movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
  
+       TRACE_IRQS_OFF
+
         /* Construct struct pt_regs on stack */
         pushq   $__USER_DS                      /* pt_regs->ss */
         pushq   PER_CPU_VAR(rsp_scratch)        /* pt_regs->sp */
-       /*
-        * Re-enable interrupts.
-        * We use 'rsp_scratch' as a scratch space, hence irq-off block above
-        * must execute atomically in the face of possible interrupt-driven
-        * task preemption. We must enable interrupts only after we're done
-        * with using rsp_scratch:
-        */
-       ENABLE_INTERRUPTS(CLBR_NONE)
         pushq   %r11                            /* pt_regs->flags */
         pushq   $__USER_CS                      /* pt_regs->cs */
         pushq   %rcx                            /* pt_regs->ip */
@@ -171,9 +175,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
         pushq   %r11                            /* pt_regs->r11 */
         sub     $(6*8), %rsp                    /* pt_regs->bp, bx, r12-15 not saved */
  
-       testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-       jnz     tracesys
+       /*
+        * If we need to do entry work or if we guess we'll need to do
+        * exit work, go straight to the slow path.
+        */
+       testl   $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+       jnz     entry_SYSCALL64_slow_path
+
  entry_SYSCALL_64_fastpath:
+       /*
+        * Easy case: enable interrupts and issue the syscall.  If the syscall
+        * needs pt_regs, we'll call a stub that disables interrupts again
+        * and jumps to the slow path.
+        */
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_NONE)
  #if __SYSCALL_MASK == ~0
         cmpq    $__NR_syscall_max, %rax
  #else
@@ -182,103 +198,56 @@ entry_SYSCALL_64_fastpath:
  #endif
         ja      1f                              /* return -ENOSYS (already in pt_regs->ax) */
         movq    %r10, %rcx
+
+       /*
+        * This call instruction is handled specially in stub_ptregs_64.
+        * It might end up jumping to the slow path.  If it jumps, RAX
+        * and all argument registers are clobbered.
+        */
         call    *sys_call_table(, %rax, 8)
+.Lentry_SYSCALL_64_after_fastpath_call:
+
         movq    %rax, RAX(%rsp)
  1:
-/*
- * Syscall return path ending with SYSRET (fast path).
- * Has incompletely filled pt_regs.
- */
-       LOCKDEP_SYS_EXIT
-       /*
-        * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
-        * it is too small to ever cause noticeable irq latency.
-        */
-       DISABLE_INTERRUPTS(CLBR_NONE)
  
         /*
-        * We must check ti flags with interrupts (or at least preemption)
-        * off because we must *never* return to userspace without
-        * processing exit work that is enqueued if we're preempted here.
-        * In particular, returning to userspace with any of the one-shot
-        * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
-        * very bad.
+        * If we get here, then we know that pt_regs is clean for SYSRET64.
+        * If we see that no exit work is required (which we are required
+        * to check with IRQs off), then we can go straight to SYSRET64.
          */
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
         testl   $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-       jnz     int_ret_from_sys_call_irqs_off  /* Go to the slow path */
+       jnz     1f
  
-       RESTORE_C_REGS_EXCEPT_RCX_R11
+       LOCKDEP_SYS_EXIT
+       TRACE_IRQS_ON           /* user mode is traced as IRQs on */
         movq    RIP(%rsp), %rcx
         movq    EFLAGS(%rsp), %r11
+       RESTORE_C_REGS_EXCEPT_RCX_R11
         movq    RSP(%rsp), %rsp
-       /*
-        * 64-bit SYSRET restores rip from rcx,
-        * rflags from r11 (but RF and VM bits are forced to 0),
-        * cs and ss are loaded from MSRs.
-        * Restoration of rflags re-enables interrupts.
-        *
-        * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
-        * descriptor is not reinitialized.  This means that we should
-        * avoid SYSRET with SS == NULL, which could happen if we schedule,
-        * exit the kernel, and re-enter using an interrupt vector.  (All
-        * interrupt entries on x86_64 set SS to NULL.)  We prevent that
-        * from happening by reloading SS in __switch_to.  (Actually
-        * detecting the failure in 64-bit userspace is tricky but can be
-        * done.)
-        */
         USERGS_SYSRET64
  
-GLOBAL(int_ret_from_sys_call_irqs_off)
+1:
+       /*
+        * The fast path looked good when we started, but something changed
+        * along the way and we need to switch to the slow path.  Calling
+        * raise(3) will trigger this, for example.  IRQs are off.
+        */
         TRACE_IRQS_ON
         ENABLE_INTERRUPTS(CLBR_NONE)
-       jmp int_ret_from_sys_call
-
-       /* Do syscall entry tracing */
-tracesys:
-       movq    %rsp, %rdi
-       movl    $AUDIT_ARCH_X86_64, %esi
-       call    syscall_trace_enter_phase1
-       test    %rax, %rax
-       jnz     tracesys_phase2                 /* if needed, run the slow path */
-       RESTORE_C_REGS_EXCEPT_RAX               /* else restore clobbered regs */
-       movq    ORIG_RAX(%rsp), %rax
-       jmp     entry_SYSCALL_64_fastpath       /* and return to the fast path */
-
-tracesys_phase2:
         SAVE_EXTRA_REGS
         movq    %rsp, %rdi
-       movl    $AUDIT_ARCH_X86_64, %esi
-       movq    %rax, %rdx
-       call    syscall_trace_enter_phase2
-
-       /*
-        * Reload registers from stack in case ptrace changed them.
-        * We don't reload %rax because syscall_trace_entry_phase2() returned
-        * the value it wants us to use in the table lookup.
-        */
-       RESTORE_C_REGS_EXCEPT_RAX
-       RESTORE_EXTRA_REGS
-#if __SYSCALL_MASK == ~0
-       cmpq    $__NR_syscall_max, %rax
-#else
-       andl    $__SYSCALL_MASK, %eax
-       cmpl    $__NR_syscall_max, %eax
-#endif
-       ja      1f                              /* return -ENOSYS (already in pt_regs->ax) */
-       movq    %r10, %rcx                      /* fixup for C */
-       call    *sys_call_table(, %rax, 8)
-       movq    %rax, RAX(%rsp)
-1:
-       /* Use IRET because user could have changed pt_regs->foo */
+       call    syscall_return_slowpath /* returns with IRQs disabled */
+       jmp     return_from_SYSCALL_64
  
-/*
- * Syscall return path ending with IRET.
- * Has correct iret frame.
- */
-GLOBAL(int_ret_from_sys_call)
+entry_SYSCALL64_slow_path:
+       /* IRQs are off. */
         SAVE_EXTRA_REGS
         movq    %rsp, %rdi
-       call    syscall_return_slowpath /* returns with IRQs disabled */
+       call    do_syscall_64           /* returns with IRQs disabled */
+
+return_from_SYSCALL_64:
         RESTORE_EXTRA_REGS
         TRACE_IRQS_IRETQ                /* we're about to change IF */
  
@@ -355,83 +324,45 @@ opportunistic_sysret_failed:
         jmp     restore_c_regs_and_iret
  END(entry_SYSCALL_64)
  
+ENTRY(stub_ptregs_64)
+       /*
+        * Syscalls marked as needing ptregs land here.
+        * If we are on the fast path, we need to save the extra regs,
+        * which we achieve by trying again on the slow path.  If we are on
+        * the slow path, the extra regs are already saved.
+        *
+        * RAX stores a pointer to the C function implementing the syscall.
+        * IRQs are on.
+        */
+       cmpq    $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
+       jne     1f
  
-       .macro FORK_LIKE func
-ENTRY(stub_\func)
-       SAVE_EXTRA_REGS 8
-       jmp     sys_\func
-END(stub_\func)
-       .endm
-
-       FORK_LIKE  clone
-       FORK_LIKE  fork
-       FORK_LIKE  vfork
-
-ENTRY(stub_execve)
-       call    sys_execve
-return_from_execve:
-       testl   %eax, %eax
-       jz      1f
-       /* exec failed, can use fast SYSRET code path in this case */
-       ret
-1:
-       /* must use IRET code path (pt_regs->cs may have changed) */
-       addq    $8, %rsp
-       ZERO_EXTRA_REGS
-       movq    %rax, RAX(%rsp)
-       jmp     int_ret_from_sys_call
-END(stub_execve)
-/*
- * Remaining execve stubs are only 7 bytes long.
- * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
- */
-       .align  8
-GLOBAL(stub_execveat)
-       call    sys_execveat
-       jmp     return_from_execve
-END(stub_execveat)
-
-#if defined(CONFIG_X86_X32_ABI)
-       .align  8
-GLOBAL(stub_x32_execve)
-       call    compat_sys_execve
-       jmp     return_from_execve
-END(stub_x32_execve)
-       .align  8
-GLOBAL(stub_x32_execveat)
-       call    compat_sys_execveat
-       jmp     return_from_execve
-END(stub_x32_execveat)
-#endif
-
-/*
- * sigreturn is special because it needs to restore all registers on return.
- * This cannot be done with SYSRET, so use the IRET return path instead.
- */
-ENTRY(stub_rt_sigreturn)
         /*
-        * SAVE_EXTRA_REGS result is not normally needed:
-        * sigreturn overwrites all pt_regs->GPREGS.
-        * But sigreturn can fail (!), and there is no easy way to detect that.
-        * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
-        * we SAVE_EXTRA_REGS here.
+        * Called from fast path -- disable IRQs again, pop return address
+        * and jump to slow path
          */
-       SAVE_EXTRA_REGS 8
-       call    sys_rt_sigreturn
-return_from_stub:
-       addq    $8, %rsp
-       RESTORE_EXTRA_REGS
-       movq    %rax, RAX(%rsp)
-       jmp     int_ret_from_sys_call
-END(stub_rt_sigreturn)
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
+       popq    %rax
+       jmp     entry_SYSCALL64_slow_path
  
-#ifdef CONFIG_X86_X32_ABI
-ENTRY(stub_x32_rt_sigreturn)
-       SAVE_EXTRA_REGS 8
-       call    sys32_x32_rt_sigreturn
-       jmp     return_from_stub
-END(stub_x32_rt_sigreturn)
-#endif
+1:
+       /* Called from C */
+       jmp     *%rax                           /* called from C */
+END(stub_ptregs_64)
+
+.macro ptregs_stub func
+ENTRY(ptregs_\func)
+       leaq    \func(%rip), %rax
+       jmp     stub_ptregs_64
+END(ptregs_\func)
+.endm
+
+/* Instantiate ptregs_stub for each ptregs-using syscall */
+#define __SYSCALL_64_QUAL_(sym)
+#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
+#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
+#include <asm/syscalls_64.h>
  
  /*
   * A newly forked process directly context switches into this address.
@@ -439,7 +370,6 @@ END(stub_x32_rt_sigreturn)
   * rdi: prev task we switched from
   */
  ENTRY(ret_from_fork)
-
         LOCK ; btr $TIF_FORK, TI_flags(%r8)
  
         pushq   $0x0002
@@ -447,28 +377,32 @@ ENTRY(ret_from_fork)
  
         call    schedule_tail                   /* rdi: 'prev' task parameter */
  
-       RESTORE_EXTRA_REGS
-
         testb   $3, CS(%rsp)                    /* from kernel_thread? */
+       jnz     1f
  
         /*
-        * By the time we get here, we have no idea whether our pt_regs,
-        * ti flags, and ti status came from the 64-bit SYSCALL fast path,
-        * the slow path, or one of the 32-bit compat paths.
-        * Use IRET code path to return, since it can safely handle
-        * all of the above.
+        * We came from kernel_thread.  This code path is quite twisted, and
+        * someone should clean it up.
+        *
+        * copy_thread_tls stashes the function pointer in RBX and the
+        * parameter to be passed in RBP.  The called function is permitted
+        * to call do_execve and thereby jump to user mode.
          */
-       jnz     int_ret_from_sys_call
+       movq    RBP(%rsp), %rdi
+       call    *RBX(%rsp)
+       movl    $0, RAX(%rsp)
  
         /*
-        * We came from kernel_thread
-        * nb: we depend on RESTORE_EXTRA_REGS above
+        * Fall through as though we're exiting a syscall.  This makes a
+        * twisted sort of sense if we just called do_execve.
          */
-       movq    %rbp, %rdi
-       call    *%rbx
-       movl    $0, RAX(%rsp)
-       RESTORE_EXTRA_REGS
-       jmp     int_ret_from_sys_call
+
+1:
+       movq    %rsp, %rdi
+       call    syscall_return_slowpath /* returns with IRQs disabled */
+       TRACE_IRQS_ON                   /* user mode is traced as IRQS on */
+       SWAPGS
+       jmp     restore_regs_and_iret
  END(ret_from_fork)
  
  /*
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S

index 3c990eeee40bd6e83b4d830654fdf966250bb0e1..847f2f0c31e50d1029a2e39600ed86e772ef8847 100644 (file)
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -19,12 +19,21 @@
         .section .entry.text, "ax"
  
  /*
- * 32-bit SYSENTER instruction entry.
+ * 32-bit SYSENTER entry.
   *
- * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
- * IF and VM in rflags are cleared (IOW: interrupts are off).
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * on 64-bit kernels running on Intel CPUs.
+ *
+ * The SYSENTER instruction, in principle, should *only* occur in the
+ * vDSO.  In practice, a small number of Android devices were shipped
+ * with a copy of Bionic that inlined a SYSENTER instruction.  This
+ * never happened in any of Google's Bionic versions -- it only happened
+ * in a narrow range of Intel-provided versions.
+ *
+ * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs.
+ * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
   * SYSENTER does not save anything on the stack,
- * and does not save old rip (!!!) and rflags.
+ * and does not save old RIP (!!!), RSP, or RFLAGS.
   *
   * Arguments:
   * eax  system call number
@@ -35,10 +44,6 @@
   * edi  arg5
   * ebp  user stack
   * 0(%ebp) arg6
- *
- * This is purely a fast path. For anything complicated we use the int 0x80
- * path below. We set up a complete hardware stack frame to share code
- * with the int 0x80 path.
   */
  ENTRY(entry_SYSENTER_compat)
         /* Interrupts are off on entry. */
@@ -66,8 +71,6 @@ ENTRY(entry_SYSENTER_compat)
          */
         pushfq                          /* pt_regs->flags (except IF = 0) */
         orl     $X86_EFLAGS_IF, (%rsp)  /* Fix saved flags */
-       ASM_CLAC                        /* Clear AC after saving FLAGS */
-
         pushq   $__USER32_CS            /* pt_regs->cs */
         xorq    %r8,%r8
         pushq   %r8                     /* pt_regs->ip = 0 (placeholder) */
@@ -90,19 +93,25 @@ ENTRY(entry_SYSENTER_compat)
         cld
  
         /*
-        * Sysenter doesn't filter flags, so we need to clear NT
+        * SYSENTER doesn't filter flags, so we need to clear NT and AC
          * ourselves.  To save a few cycles, we can check whether
-        * NT was set instead of doing an unconditional popfq.
+        * either was set instead of doing an unconditional popfq.
          * This needs to happen before enabling interrupts so that
          * we don't get preempted with NT set.
          *
+        * If TF is set, we will single-step all the way to here -- do_debug
+        * will ignore all the traps.  (Yes, this is slow, but so is
+        * single-stepping in general.  This allows us to avoid having
+        * a more complicated code to handle the case where a user program
+        * forces us to single-step through the SYSENTER entry code.)
+        *
          * NB.: .Lsysenter_fix_flags is a label with the code under it moved
          * out-of-line as an optimization: NT is unlikely to be set in the
          * majority of the cases and instead of polluting the I$ unnecessarily,
          * we're keeping that code behind a branch which will predict as
          * not-taken and therefore its instructions won't be fetched.
          */
-       testl   $X86_EFLAGS_NT, EFLAGS(%rsp)
+       testl   $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp)
         jnz     .Lsysenter_fix_flags
  .Lsysenter_flags_fixed:
  
@@ -123,20 +132,42 @@ ENTRY(entry_SYSENTER_compat)
         pushq   $X86_EFLAGS_FIXED
         popfq
         jmp     .Lsysenter_flags_fixed
+GLOBAL(__end_entry_SYSENTER_compat)
  ENDPROC(entry_SYSENTER_compat)
  
  /*
- * 32-bit SYSCALL instruction entry.
+ * 32-bit SYSCALL entry.
+ *
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * on 64-bit kernels running on AMD CPUs.
+ *
+ * The SYSCALL instruction, in principle, should *only* occur in the
+ * vDSO.  In practice, it appears that this really is the case.
+ * As evidence:
+ *
+ *  - The calling convention for SYSCALL has changed several times without
+ *    anyone noticing.
   *
- * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
- * then loads new ss, cs, and rip from previously programmed MSRs.
- * rflags gets masked by a value from another MSR (so CLD and CLAC
- * are not needed). SYSCALL does not save anything on the stack
- * and does not change rsp.
+ *  - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, anything
+ *    user task that did SYSCALL without immediately reloading SS
+ *    would randomly crash.
   *
- * Note: rflags saving+masking-with-MSR happens only in Long mode
+ *  - Most programmers do not directly target AMD CPUs, and the 32-bit
+ *    SYSCALL instruction does not exist on Intel CPUs.  Even on AMD
+ *    CPUs, Linux disables the SYSCALL instruction on 32-bit kernels
+ *    because the SYSCALL instruction in legacy/native 32-bit mode (as
+ *    opposed to compat mode) is sufficiently poorly designed as to be
+ *    essentially unusable.
+ *
+ * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves
+ * RFLAGS to R11, then loads new SS, CS, and RIP from previously
+ * programmed MSRs.  RFLAGS gets masked by a value from another MSR
+ * (so CLD and CLAC are not needed).  SYSCALL does not save anything on
+ * the stack and does not change RSP.
+ *
+ * Note: RFLAGS saving+masking-with-MSR happens only in Long mode
   * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
- * Don't get confused: rflags saving+masking depends on Long Mode Active bit
+ * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit
   * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
   * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
   *
@@ -236,7 +267,21 @@ sysret32_from_system_call:
  END(entry_SYSCALL_compat)
  
  /*
- * Emulated IA32 system calls via int 0x80.
+ * 32-bit legacy system call entry.
+ *
+ * 32-bit x86 Linux system calls traditionally used the INT $0x80
+ * instruction.  INT $0x80 lands here.
+ *
+ * This entry point can be used by 32-bit and 64-bit programs to perform
+ * 32-bit system calls.  Instances of INT $0x80 can be found inline in
+ * various programs and libraries.  It is also used by the vDSO's
+ * __kernel_vsyscall fallback for hardware that doesn't support a faster
+ * entry method.  Restarted 32-bit system calls also fall back to INT
+ * $0x80 regardless of what instruction was originally used to do the
+ * system call.
+ *
+ * This is considered a slow path.  It is not used by most libc
+ * implementations on modern hardware except during process startup.
   *
   * Arguments:
   * eax  system call number
@@ -245,17 +290,8 @@ END(entry_SYSCALL_compat)
   * edx  arg3
   * esi  arg4
   * edi  arg5
- * ebp  arg6   (note: not saved in the stack frame, should not be touched)
- *
- * Notes:
- * Uses the same stack frame as the x86-64 version.
- * All registers except eax must be saved (but ptrace may violate that).
- * Arguments are zero extended. For system calls that want sign extension and
- * take long arguments a wrapper is needed. Most calls can just be called
- * directly.
- * Assumes it is only called from user space and entered with interrupts off.
+ * ebp  arg6
   */
-
  ENTRY(entry_INT80_compat)
         /*
          * Interrupts are off on entry.
@@ -300,7 +336,7 @@ ENTRY(entry_INT80_compat)
         TRACE_IRQS_OFF
  
         movq    %rsp, %rdi
-       call    do_syscall_32_irqs_off
+       call    do_int80_syscall_32
  .Lsyscall_32_done:
  
         /* Go back to user mode. */
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c

index 9a6649857106a0e0bc6bfe8ad0074562e394a5d1..8f895ee13a1cbfa1f1666a251471d29d2db01ea2 100644 (file)
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -6,17 +6,11 @@
  #include <asm/asm-offsets.h>
  #include <asm/syscall.h>
  
-#ifdef CONFIG_IA32_EMULATION
-#define SYM(sym, compat) compat
-#else
-#define SYM(sym, compat) sym
-#endif
-
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
  #include <asm/syscalls_32.h>
  #undef __SYSCALL_I386
  
-#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
+#define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
  
  extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
  
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c

index 41283d22be7a92db23e46fb427593a2e878ee37d..9dbc5abb6162fa20581069499667a8c49b254868 100644 (file)
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -6,19 +6,14 @@
  #include <asm/asm-offsets.h>
  #include <asm/syscall.h>
  
-#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
+#define __SYSCALL_64_QUAL_(sym) sym
+#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
  
-#ifdef CONFIG_X86_X32_ABI
-# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
-#else
-# define __SYSCALL_X32(nr, sym, compat) /* nothing */
-#endif
-
-#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
  #include <asm/syscalls_64.h>
  #undef __SYSCALL_64
  
-#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
+#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
  
  extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
  
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl

index dc1040a50bdc21594317f9f9dc4c59196a25ae63..2e5b565adacc542415854612ad752e04cfc5df95 100644 (file)
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -21,7 +21,7 @@
  12     common  brk                     sys_brk
  13     64      rt_sigaction            sys_rt_sigaction
  14     common  rt_sigprocmask          sys_rt_sigprocmask
-15     64      rt_sigreturn            stub_rt_sigreturn
+15     64      rt_sigreturn            sys_rt_sigreturn/ptregs
  16     64      ioctl                   sys_ioctl
  17     common  pread64                 sys_pread64
  18     common  pwrite64                sys_pwrite64
@@ -62,10 +62,10 @@
  53     common  socketpair              sys_socketpair
  54     64      setsockopt              sys_setsockopt
  55     64      getsockopt              sys_getsockopt
-56     common  clone                   stub_clone
-57     common  fork                    stub_fork
-58     common  vfork                   stub_vfork
-59     64      execve                  stub_execve
+56     common  clone                   sys_clone/ptregs
+57     common  fork                    sys_fork/ptregs
+58     common  vfork                   sys_vfork/ptregs
+59     64      execve                  sys_execve/ptregs
  60     common  exit                    sys_exit
  61     common  wait4                   sys_wait4
  62     common  kill                    sys_kill
@@ -178,7 +178,7 @@
  169    common  reboot                  sys_reboot
  170    common  sethostname             sys_sethostname
  171    common  setdomainname           sys_setdomainname
-172    common  iopl                    sys_iopl
+172    common  iopl                    sys_iopl/ptregs
  173    common  ioperm                  sys_ioperm
  174    64      create_module
  175    common  init_module             sys_init_module
@@ -328,7 +328,7 @@
  319    common  memfd_create            sys_memfd_create
  320    common  kexec_file_load         sys_kexec_file_load
  321    common  bpf                     sys_bpf
-322    64      execveat                stub_execveat
+322    64      execveat                sys_execveat/ptregs
  323    common  userfaultfd             sys_userfaultfd
  324    common  membarrier              sys_membarrier
  325    common  mlock2                  sys_mlock2
@@ -339,14 +339,14 @@
  # for native 64-bit operation.
  #
  512    x32     rt_sigaction            compat_sys_rt_sigaction
-513    x32     rt_sigreturn            stub_x32_rt_sigreturn
+513    x32     rt_sigreturn            sys32_x32_rt_sigreturn
  514    x32     ioctl                   compat_sys_ioctl
  515    x32     readv                   compat_sys_readv
  516    x32     writev                  compat_sys_writev
  517    x32     recvfrom                compat_sys_recvfrom
  518    x32     sendmsg                 compat_sys_sendmsg
  519    x32     recvmsg                 compat_sys_recvmsg
-520    x32     execve                  stub_x32_execve
+520    x32     execve                  compat_sys_execve/ptregs
  521    x32     ptrace                  compat_sys_ptrace
  522    x32     rt_sigpending           compat_sys_rt_sigpending
  523    x32     rt_sigtimedwait         compat_sys_rt_sigtimedwait
@@ -371,4 +371,4 @@
  542    x32     getsockopt              compat_sys_getsockopt
  543    x32     io_setup                compat_sys_io_setup
  544    x32     io_submit               compat_sys_io_submit
-545    x32     execveat                stub_x32_execveat
+545    x32     execveat                compat_sys_execveat/ptregs
diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh

index 0e7f8ec071e76217a6cee34bc90d0ca7b997443e..cd3d3015d7df87ed79f469c7be0ce6549c62433f 100644 (file)
--- a/arch/x86/entry/syscalls/syscalltbl.sh
+++ b/arch/x86/entry/syscalls/syscalltbl.sh
@@ -3,13 +3,63 @@
  in="$1"
  out="$2"
  
+syscall_macro() {
+    abi="$1"
+    nr="$2"
+    entry="$3"
+
+    # Entry can be either just a function name or "function/qualifier"
+    real_entry="${entry%%/*}"
+    qualifier="${entry:${#real_entry}}"                # Strip the function name
+    qualifier="${qualifier:1}"                 # Strip the slash, if any
+
+    echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)"
+}
+
+emit() {
+    abi="$1"
+    nr="$2"
+    entry="$3"
+    compat="$4"
+
+    if [ "$abi" == "64" -a -n "$compat" ]; then
+       echo "a compat entry for a 64-bit syscall makes no sense" >&2
+       exit 1
+    fi
+
+    if [ -z "$compat" ]; then
+       if [ -n "$entry" ]; then
+           syscall_macro "$abi" "$nr" "$entry"
+       fi
+    else
+       echo "#ifdef CONFIG_X86_32"
+       if [ -n "$entry" ]; then
+           syscall_macro "$abi" "$nr" "$entry"
+       fi
+       echo "#else"
+       syscall_macro "$abi" "$nr" "$compat"
+       echo "#endif"
+    fi
+}
+
  grep '^[0-9]' "$in" | sort -n | (
      while read nr abi name entry compat; do
         abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
-       if [ -n "$compat" ]; then
-           echo "__SYSCALL_${abi}($nr, $entry, $compat)"
-       elif [ -n "$entry" ]; then
-           echo "__SYSCALL_${abi}($nr, $entry, $entry)"
+       if [ "$abi" == "COMMON" -o "$abi" == "64" ]; then
+           # COMMON is the same as 64, except that we don't expect X32
+           # programs to use it.  Our expectation has nothing to do with
+           # any generated code, so treat them the same.
+           emit 64 "$nr" "$entry" "$compat"
+       elif [ "$abi" == "X32" ]; then
+           # X32 is equivalent to 64 on an X32-compatible kernel.
+           echo "#ifdef CONFIG_X86_X32_ABI"
+           emit 64 "$nr" "$entry" "$compat"
+           echo "#endif"
+       elif [ "$abi" == "I386" ]; then
+           emit "$abi" "$nr" "$entry" "$compat"
+       else
+           echo "Unknown abi $abi" >&2
+           exit 1
         fi
      done
  ) > "$out"
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h

index 0224987556ce80bd606063b56ef124b0857a3f44..63a03bb91497e63d56e1ba329f9da7b80e751e1d 100644 (file)
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -140,7 +140,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
         fprintf(outfile, "#include <asm/vdso.h>\n");
         fprintf(outfile, "\n");
         fprintf(outfile,
-               "static unsigned char raw_data[%lu] __page_aligned_data = {",
+               "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {",
                 mapping_size);
         for (j = 0; j < stripped_len; j++) {
                 if (j % 10 == 0)
@@ -150,16 +150,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
         }
         fprintf(outfile, "\n};\n\n");
  
-       fprintf(outfile, "static struct page *pages[%lu];\n\n",
-               mapping_size / 4096);
-
         fprintf(outfile, "const struct vdso_image %s = {\n", name);
         fprintf(outfile, "\t.data = raw_data,\n");
         fprintf(outfile, "\t.size = %lu,\n", mapping_size);
-       fprintf(outfile, "\t.text_mapping = {\n");
-       fprintf(outfile, "\t\t.name = \"[vdso]\",\n");
-       fprintf(outfile, "\t\t.pages = pages,\n");
-       fprintf(outfile, "\t},\n");
         if (alt_sec) {
                 fprintf(outfile, "\t.alt = %lu,\n",
                         (unsigned long)GET_LE(&alt_sec->sh_offset));
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c

index 08a317a9ae4b582974ec4af4842461bddb762121..7853b53959cd35a8d555a5ddebfbb4af1f40562a 100644 (file)
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -11,7 +11,6 @@
  #include <linux/kernel.h>
  #include <linux/mm_types.h>
  
-#include <asm/cpufeature.h>
  #include <asm/processor.h>
  #include <asm/vdso.h>
  
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S

index 3a1d9297074bc5e1d2e559735bb5247acffa6164..0109ac6cb79cc73a5d74c99bc2c97705a9ca5c7d 100644 (file)
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -3,7 +3,7 @@
  */
  
  #include <asm/dwarf2.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/alternative-asm.h>
  
  /*
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c

index b8f69e264ac4148afbdaeedc69e24132980117a9..10f704584922653fd208646cac11c4f8a9cd776b 100644 (file)
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -20,6 +20,7 @@
  #include <asm/page.h>
  #include <asm/hpet.h>
  #include <asm/desc.h>
+#include <asm/cpufeature.h>
  
  #if defined(CONFIG_X86_64)
  unsigned int __read_mostly vdso64_enabled = 1;
@@ -27,13 +28,7 @@ unsigned int __read_mostly vdso64_enabled = 1;
  
  void __init init_vdso_image(const struct vdso_image *image)
  {
-       int i;
-       int npages = (image->size) / PAGE_SIZE;
-
         BUG_ON(image->size % PAGE_SIZE != 0);
-       for (i = 0; i < npages; i++)
-               image->text_mapping.pages[i] =
-                       virt_to_page(image->data + i*PAGE_SIZE);
  
         apply_alternatives((struct alt_instr *)(image->data + image->alt),
                            (struct alt_instr *)(image->data + image->alt +
@@ -90,18 +85,87 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
  #endif
  }
  
+static int vdso_fault(const struct vm_special_mapping *sm,
+                     struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       const struct vdso_image *image = vma->vm_mm->context.vdso_image;
+
+       if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
+               return VM_FAULT_SIGBUS;
+
+       vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
+       get_page(vmf->page);
+       return 0;
+}
+
+static const struct vm_special_mapping text_mapping = {
+       .name = "[vdso]",
+       .fault = vdso_fault,
+};
+
+static int vvar_fault(const struct vm_special_mapping *sm,
+                     struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       const struct vdso_image *image = vma->vm_mm->context.vdso_image;
+       long sym_offset;
+       int ret = -EFAULT;
+
+       if (!image)
+               return VM_FAULT_SIGBUS;
+
+       sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
+               image->sym_vvar_start;
+
+       /*
+        * Sanity check: a symbol offset of zero means that the page
+        * does not exist for this vdso image, not that the page is at
+        * offset zero relative to the text mapping.  This should be
+        * impossible here, because sym_offset should only be zero for
+        * the page past the end of the vvar mapping.
+        */
+       if (sym_offset == 0)
+               return VM_FAULT_SIGBUS;
+
+       if (sym_offset == image->sym_vvar_page) {
+               ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
+                                   __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
+       } else if (sym_offset == image->sym_hpet_page) {
+#ifdef CONFIG_HPET_TIMER
+               if (hpet_address && vclock_was_used(VCLOCK_HPET)) {
+                       ret = vm_insert_pfn_prot(
+                               vma,
+                               (unsigned long)vmf->virtual_address,
+                               hpet_address >> PAGE_SHIFT,
+                               pgprot_noncached(PAGE_READONLY));
+               }
+#endif
+       } else if (sym_offset == image->sym_pvclock_page) {
+               struct pvclock_vsyscall_time_info *pvti =
+                       pvclock_pvti_cpu0_va();
+               if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
+                       ret = vm_insert_pfn(
+                               vma,
+                               (unsigned long)vmf->virtual_address,
+                               __pa(pvti) >> PAGE_SHIFT);
+               }
+       }
+
+       if (ret == 0 || ret == -EBUSY)
+               return VM_FAULT_NOPAGE;
+
+       return VM_FAULT_SIGBUS;
+}
+
  static int map_vdso(const struct vdso_image *image, bool calculate_addr)
  {
         struct mm_struct *mm = current->mm;
         struct vm_area_struct *vma;
         unsigned long addr, text_start;
         int ret = 0;
-       static struct page *no_pages[] = {NULL};
-       static struct vm_special_mapping vvar_mapping = {
+       static const struct vm_special_mapping vvar_mapping = {
                 .name = "[vvar]",
-               .pages = no_pages,
+               .fault = vvar_fault,
         };
-       struct pvclock_vsyscall_time_info *pvti;
  
         if (calculate_addr) {
                 addr = vdso_addr(current->mm->start_stack,
@@ -121,6 +185,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
  
         text_start = addr - image->sym_vvar_start;
         current->mm->context.vdso = (void __user *)text_start;
+       current->mm->context.vdso_image = image;
  
         /*
          * MAYWRITE to allow gdb to COW and set breakpoints
@@ -130,7 +195,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
                                        image->size,
                                        VM_READ|VM_EXEC|
                                        VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
-                                      &image->text_mapping);
+                                      &text_mapping);
  
         if (IS_ERR(vma)) {
                 ret = PTR_ERR(vma);
@@ -140,7 +205,8 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
         vma = _install_special_mapping(mm,
                                        addr,
                                        -image->sym_vvar_start,
-                                      VM_READ|VM_MAYREAD,
+                                      VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
+                                      VM_PFNMAP,
                                        &vvar_mapping);
  
         if (IS_ERR(vma)) {
@@ -148,41 +214,6 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
                 goto up_fail;
         }
  
-       if (image->sym_vvar_page)
-               ret = remap_pfn_range(vma,
-                                     text_start + image->sym_vvar_page,
-                                     __pa_symbol(&__vvar_page) >> PAGE_SHIFT,
-                                     PAGE_SIZE,
-                                     PAGE_READONLY);
-
-       if (ret)
-               goto up_fail;
-
-#ifdef CONFIG_HPET_TIMER
-       if (hpet_address && image->sym_hpet_page) {
-               ret = io_remap_pfn_range(vma,
-                       text_start + image->sym_hpet_page,
-                       hpet_address >> PAGE_SHIFT,
-                       PAGE_SIZE,
-                       pgprot_noncached(PAGE_READONLY));
-
-               if (ret)
-                       goto up_fail;
-       }
-#endif
-
-       pvti = pvclock_pvti_cpu0_va();
-       if (pvti && image->sym_pvclock_page) {
-               ret = remap_pfn_range(vma,
-                                     text_start + image->sym_pvclock_page,
-                                     __pa(pvti) >> PAGE_SHIFT,
-                                     PAGE_SIZE,
-                                     PAGE_READONLY);
-
-               if (ret)
-                       goto up_fail;
-       }
-
  up_fail:
         if (ret)
                 current->mm->context.vdso = NULL;
@@ -254,7 +285,7 @@ static void vgetcpu_cpu_init(void *arg)
  #ifdef CONFIG_NUMA
         node = cpu_to_node(cpu);
  #endif
-       if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
+       if (static_cpu_has(X86_FEATURE_RDTSCP))
                 write_rdtscp_aux((node << 12) | cpu);
  
         /*
diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c

index 51e3304169951619362ea4a1494716e4f20696bf..0fb3a104ac626b07e0a4e604a8ebf36764bea2cf 100644 (file)
--- a/arch/x86/entry/vsyscall/vsyscall_gtod.c
+++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c
@@ -16,6 +16,8 @@
  #include <asm/vgtod.h>
  #include <asm/vvar.h>
  
+int vclocks_used __read_mostly;
+
  DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
  
  void update_vsyscall_tz(void)
@@ -26,12 +28,17 @@ void update_vsyscall_tz(void)
  
  void update_vsyscall(struct timekeeper *tk)
  {
+       int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
         struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
  
+       /* Mark the new vclock used. */
+       BUILD_BUG_ON(VCLOCK_MAX >= 32);
+       WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode));
+
         gtod_write_begin(vdata);
  
         /* copy vsyscall data */
-       vdata->vclock_mode      = tk->tkr_mono.clock->archdata.vclock_mode;
+       vdata->vclock_mode      = vclock_mode;
         vdata->cycle_last       = tk->tkr_mono.cycle_last;
         vdata->mask             = tk->tkr_mono.mask;
         vdata->mult             = tk->tkr_mono.mult;
diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile

new file mode 100644 (file)

index 0000000..fdfea15
--- /dev/null
+++ b/arch/x86/events/Makefile
@@ -0,0 +1,13 @@
+obj-y                  += core.o
+
+obj-$(CONFIG_CPU_SUP_AMD)               += amd/core.o amd/uncore.o
+obj-$(CONFIG_X86_LOCAL_APIC)            += amd/ibs.o msr.o
+ifdef CONFIG_AMD_IOMMU
+obj-$(CONFIG_CPU_SUP_AMD)               += amd/iommu.o
+endif
+obj-$(CONFIG_CPU_SUP_INTEL)            += intel/core.o intel/bts.o intel/cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL)            += intel/cstate.o intel/ds.o intel/knc.o 
+obj-$(CONFIG_CPU_SUP_INTEL)            += intel/lbr.o intel/p4.o intel/p6.o intel/pt.o
+obj-$(CONFIG_CPU_SUP_INTEL)            += intel/rapl.o msr.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c

new file mode 100644 (file)

index 0000000..049ada8
--- /dev/null
+++ b/arch/x86/events/amd/core.c
@@ -0,0 +1,731 @@
+#include <linux/perf_event.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/apicdef.h>
+
+#include "../perf_event.h"
+
+static __initconst const u64 amd_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
+               [ C(RESULT_MISS)   ] = 0x0141, /* Data Cache Misses          */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */
+               [ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
+               [ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
+               [ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback           */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
+               [ C(RESULT_MISS)   ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes        */
+               [ C(RESULT_MISS)   ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
+               [ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
+               [ C(RESULT_MISS)   ] = 0x98e9, /* CPU Request to Memory, r   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+/*
+ * AMD Performance Monitor K7 and later.
+ */
+static const u64 amd_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]                   = 0x0076,
+  [PERF_COUNT_HW_INSTRUCTIONS]                 = 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]             = 0x0080,
+  [PERF_COUNT_HW_CACHE_MISSES]                 = 0x0081,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]          = 0x00c2,
+  [PERF_COUNT_HW_BRANCH_MISSES]                        = 0x00c3,
+  [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]      = 0x00d0, /* "Decoder empty" event */
+  [PERF_COUNT_HW_STALLED_CYCLES_BACKEND]       = 0x00d1, /* "Dispatch stalls" event */
+};
+
+static u64 amd_pmu_event_map(int hw_event)
+{
+       return amd_perfmon_event_map[hw_event];
+}
+
+/*
+ * Previously calculated offsets
+ */
+static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
+static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
+
+/*
+ * Legacy CPUs:
+ *   4 counters starting at 0xc0010000 each offset by 1
+ *
+ * CPUs with core performance counter extensions:
+ *   6 counters starting at 0xc0010200 each offset by 2
+ */
+static inline int amd_pmu_addr_offset(int index, bool eventsel)
+{
+       int offset;
+
+       if (!index)
+               return index;
+
+       if (eventsel)
+               offset = event_offsets[index];
+       else
+               offset = count_offsets[index];
+
+       if (offset)
+               return offset;
+
+       if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+               offset = index;
+       else
+               offset = index << 1;
+
+       if (eventsel)
+               event_offsets[index] = offset;
+       else
+               count_offsets[index] = offset;
+
+       return offset;
+}
+
+static int amd_core_hw_config(struct perf_event *event)
+{
+       if (event->attr.exclude_host && event->attr.exclude_guest)
+               /*
+                * When HO == GO == 1 the hardware treats that as GO == HO == 0
+                * and will count in both modes. We don't want to count in that
+                * case so we emulate no-counting by setting US = OS = 0.
+                */
+               event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
+                                     ARCH_PERFMON_EVENTSEL_OS);
+       else if (event->attr.exclude_host)
+               event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
+       else if (event->attr.exclude_guest)
+               event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
+
+       return 0;
+}
+
+/*
+ * AMD64 events are detected based on their event codes.
+ */
+static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
+{
+       return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
+}
+
+static inline int amd_is_nb_event(struct hw_perf_event *hwc)
+{
+       return (hwc->config & 0xe0) == 0xe0;
+}
+
+static inline int amd_has_nb(struct cpu_hw_events *cpuc)
+{
+       struct amd_nb *nb = cpuc->amd_nb;
+
+       return nb && nb->nb_id != -1;
+}
+
+static int amd_pmu_hw_config(struct perf_event *event)
+{
+       int ret;
+
+       /* pass precise event sampling to ibs: */
+       if (event->attr.precise_ip && get_ibs_caps())
+               return -ENOENT;
+
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+
+       ret = x86_pmu_hw_config(event);
+       if (ret)
+               return ret;
+
+       if (event->attr.type == PERF_TYPE_RAW)
+               event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
+
+       return amd_core_hw_config(event);
+}
+
+static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
+                                          struct perf_event *event)
+{
+       struct amd_nb *nb = cpuc->amd_nb;
+       int i;
+
+       /*
+        * need to scan whole list because event may not have
+        * been assigned during scheduling
+        *
+        * no race condition possible because event can only
+        * be removed on one CPU at a time AND PMU is disabled
+        * when we come here
+        */
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               if (cmpxchg(nb->owners + i, event, NULL) == event)
+                       break;
+       }
+}
+
+ /*
+  * AMD64 NorthBridge events need special treatment because
+  * counter access needs to be synchronized across all cores
+  * of a package. Refer to BKDG section 3.12
+  *
+  * NB events are events measuring L3 cache, Hypertransport
+  * traffic. They are identified by an event code >= 0xe00.
+  * They measure events on the NorthBride which is shared
+  * by all cores on a package. NB events are counted on a
+  * shared set of counters. When a NB event is programmed
+  * in a counter, the data actually comes from a shared
+  * counter. Thus, access to those counters needs to be
+  * synchronized.
+  *
+  * We implement the synchronization such that no two cores
+  * can be measuring NB events using the same counters. Thus,
+  * we maintain a per-NB allocation table. The available slot
+  * is propagated using the event_constraint structure.
+  *
+  * We provide only one choice for each NB event based on
+  * the fact that only NB events have restrictions. Consequently,
+  * if a counter is available, there is a guarantee the NB event
+  * will be assigned to it. If no slot is available, an empty
+  * constraint is returned and scheduling will eventually fail
+  * for this event.
+  *
+  * Note that all cores attached the same NB compete for the same
+  * counters to host NB events, this is why we use atomic ops. Some
+  * multi-chip CPUs may have more than one NB.
+  *
+  * Given that resources are allocated (cmpxchg), they must be
+  * eventually freed for others to use. This is accomplished by
+  * calling __amd_put_nb_event_constraints()
+  *
+  * Non NB events are not impacted by this restriction.
+  */
+static struct event_constraint *
+__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+                              struct event_constraint *c)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct amd_nb *nb = cpuc->amd_nb;
+       struct perf_event *old;
+       int idx, new = -1;
+
+       if (!c)
+               c = &unconstrained;
+
+       if (cpuc->is_fake)
+               return c;
+
+       /*
+        * detect if already present, if so reuse
+        *
+        * cannot merge with actual allocation
+        * because of possible holes
+        *
+        * event can already be present yet not assigned (in hwc->idx)
+        * because of successive calls to x86_schedule_events() from
+        * hw_perf_group_sched_in() without hw_perf_enable()
+        */
+       for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
+               if (new == -1 || hwc->idx == idx)
+                       /* assign free slot, prefer hwc->idx */
+                       old = cmpxchg(nb->owners + idx, NULL, event);
+               else if (nb->owners[idx] == event)
+                       /* event already present */
+                       old = event;
+               else
+                       continue;
+
+               if (old && old != event)
+                       continue;
+
+               /* reassign to this slot */
+               if (new != -1)
+                       cmpxchg(nb->owners + new, event, NULL);
+               new = idx;
+
+               /* already present, reuse */
+               if (old == event)
+                       break;
+       }
+
+       if (new == -1)
+               return &emptyconstraint;
+
+       return &nb->event_constraints[new];
+}
+
+static struct amd_nb *amd_alloc_nb(int cpu)
+{
+       struct amd_nb *nb;
+       int i;
+
+       nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu));
+       if (!nb)
+               return NULL;
+
+       nb->nb_id = -1;
+
+       /*
+        * initialize all possible NB constraints
+        */
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               __set_bit(i, nb->event_constraints[i].idxmsk);
+               nb->event_constraints[i].weight = 1;
+       }
+       return nb;
+}
+
+static int amd_pmu_cpu_prepare(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+       WARN_ON_ONCE(cpuc->amd_nb);
+
+       if (boot_cpu_data.x86_max_cores < 2)
+               return NOTIFY_OK;
+
+       cpuc->amd_nb = amd_alloc_nb(cpu);
+       if (!cpuc->amd_nb)
+               return NOTIFY_BAD;
+
+       return NOTIFY_OK;
+}
+
+static void amd_pmu_cpu_starting(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+       void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
+       struct amd_nb *nb;
+       int i, nb_id;
+
+       cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
+
+       if (boot_cpu_data.x86_max_cores < 2)
+               return;
+
+       nb_id = amd_get_nb_id(cpu);
+       WARN_ON_ONCE(nb_id == BAD_APICID);
+
+       for_each_online_cpu(i) {
+               nb = per_cpu(cpu_hw_events, i).amd_nb;
+               if (WARN_ON_ONCE(!nb))
+                       continue;
+
+               if (nb->nb_id == nb_id) {
+                       *onln = cpuc->amd_nb;
+                       cpuc->amd_nb = nb;
+                       break;
+               }
+       }
+
+       cpuc->amd_nb->nb_id = nb_id;
+       cpuc->amd_nb->refcnt++;
+}
+
+static void amd_pmu_cpu_dead(int cpu)
+{
+       struct cpu_hw_events *cpuhw;
+
+       if (boot_cpu_data.x86_max_cores < 2)
+               return;
+
+       cpuhw = &per_cpu(cpu_hw_events, cpu);
+
+       if (cpuhw->amd_nb) {
+               struct amd_nb *nb = cpuhw->amd_nb;
+
+               if (nb->nb_id == -1 || --nb->refcnt == 0)
+                       kfree(nb);
+
+               cpuhw->amd_nb = NULL;
+       }
+}
+
+static struct event_constraint *
+amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                         struct perf_event *event)
+{
+       /*
+        * if not NB event or no NB, then no constraints
+        */
+       if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
+               return &unconstrained;
+
+       return __amd_get_nb_event_constraints(cpuc, event, NULL);
+}
+
+static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
+                                     struct perf_event *event)
+{
+       if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
+               __amd_put_nb_event_constraints(cpuc, event);
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7,32-35");
+PMU_FORMAT_ATTR(umask, "config:8-15"   );
+PMU_FORMAT_ATTR(edge,  "config:18"     );
+PMU_FORMAT_ATTR(inv,   "config:23"     );
+PMU_FORMAT_ATTR(cmask, "config:24-31"  );
+
+static struct attribute *amd_format_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask.attr,
+       NULL,
+};
+
+/* AMD Family 15h */
+
+#define AMD_EVENT_TYPE_MASK    0x000000F0ULL
+
+#define AMD_EVENT_FP           0x00000000ULL ... 0x00000010ULL
+#define AMD_EVENT_LS           0x00000020ULL ... 0x00000030ULL
+#define AMD_EVENT_DC           0x00000040ULL ... 0x00000050ULL
+#define AMD_EVENT_CU           0x00000060ULL ... 0x00000070ULL
+#define AMD_EVENT_IC_DE                0x00000080ULL ... 0x00000090ULL
+#define AMD_EVENT_EX_LS                0x000000C0ULL
+#define AMD_EVENT_DE           0x000000D0ULL
+#define AMD_EVENT_NB           0x000000E0ULL ... 0x000000F0ULL
+
+/*
+ * AMD family 15h event code/PMC mappings:
+ *
+ * type = event_code & 0x0F0:
+ *
+ * 0x000       FP      PERF_CTL[5:3]
+ * 0x010       FP      PERF_CTL[5:3]
+ * 0x020       LS      PERF_CTL[5:0]
+ * 0x030       LS      PERF_CTL[5:0]
+ * 0x040       DC      PERF_CTL[5:0]
+ * 0x050       DC      PERF_CTL[5:0]
+ * 0x060       CU      PERF_CTL[2:0]
+ * 0x070       CU      PERF_CTL[2:0]
+ * 0x080       IC/DE   PERF_CTL[2:0]
+ * 0x090       IC/DE   PERF_CTL[2:0]
+ * 0x0A0       ---
+ * 0x0B0       ---
+ * 0x0C0       EX/LS   PERF_CTL[5:0]
+ * 0x0D0       DE      PERF_CTL[2:0]
+ * 0x0E0       NB      NB_PERF_CTL[3:0]
+ * 0x0F0       NB      NB_PERF_CTL[3:0]
+ *
+ * Exceptions:
+ *
+ * 0x000       FP      PERF_CTL[3], PERF_CTL[5:3] (*)
+ * 0x003       FP      PERF_CTL[3]
+ * 0x004       FP      PERF_CTL[3], PERF_CTL[5:3] (*)
+ * 0x00B       FP      PERF_CTL[3]
+ * 0x00D       FP      PERF_CTL[3]
+ * 0x023       DE      PERF_CTL[2:0]
+ * 0x02D       LS      PERF_CTL[3]
+ * 0x02E       LS      PERF_CTL[3,0]
+ * 0x031       LS      PERF_CTL[2:0] (**)
+ * 0x043       CU      PERF_CTL[2:0]
+ * 0x045       CU      PERF_CTL[2:0]
+ * 0x046       CU      PERF_CTL[2:0]
+ * 0x054       CU      PERF_CTL[2:0]
+ * 0x055       CU      PERF_CTL[2:0]
+ * 0x08F       IC      PERF_CTL[0]
+ * 0x187       DE      PERF_CTL[0]
+ * 0x188       DE      PERF_CTL[0]
+ * 0x0DB       EX      PERF_CTL[5:0]
+ * 0x0DC       LS      PERF_CTL[5:0]
+ * 0x0DD       LS      PERF_CTL[5:0]
+ * 0x0DE       LS      PERF_CTL[5:0]
+ * 0x0DF       LS      PERF_CTL[5:0]
+ * 0x1C0       EX      PERF_CTL[5:3]
+ * 0x1D6       EX      PERF_CTL[5:0]
+ * 0x1D8       EX      PERF_CTL[5:0]
+ *
+ * (*)  depending on the umask all FPU counters may be used
+ * (**) only one unitmask enabled at a time
+ */
+
+static struct event_constraint amd_f15_PMC0  = EVENT_CONSTRAINT(0, 0x01, 0);
+static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
+static struct event_constraint amd_f15_PMC3  = EVENT_CONSTRAINT(0, 0x08, 0);
+static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
+static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
+static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
+
+static struct event_constraint *
+amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
+                              struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       unsigned int event_code = amd_get_event_code(hwc);
+
+       switch (event_code & AMD_EVENT_TYPE_MASK) {
+       case AMD_EVENT_FP:
+               switch (event_code) {
+               case 0x000:
+                       if (!(hwc->config & 0x0000F000ULL))
+                               break;
+                       if (!(hwc->config & 0x00000F00ULL))
+                               break;
+                       return &amd_f15_PMC3;
+               case 0x004:
+                       if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+                               break;
+                       return &amd_f15_PMC3;
+               case 0x003:
+               case 0x00B:
+               case 0x00D:
+                       return &amd_f15_PMC3;
+               }
+               return &amd_f15_PMC53;
+       case AMD_EVENT_LS:
+       case AMD_EVENT_DC:
+       case AMD_EVENT_EX_LS:
+               switch (event_code) {
+               case 0x023:
+               case 0x043:
+               case 0x045:
+               case 0x046:
+               case 0x054:
+               case 0x055:
+                       return &amd_f15_PMC20;
+               case 0x02D:
+                       return &amd_f15_PMC3;
+               case 0x02E:
+                       return &amd_f15_PMC30;
+               case 0x031:
+                       if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+                               return &amd_f15_PMC20;
+                       return &emptyconstraint;
+               case 0x1C0:
+                       return &amd_f15_PMC53;
+               default:
+                       return &amd_f15_PMC50;
+               }
+       case AMD_EVENT_CU:
+       case AMD_EVENT_IC_DE:
+       case AMD_EVENT_DE:
+               switch (event_code) {
+               case 0x08F:
+               case 0x187:
+               case 0x188:
+                       return &amd_f15_PMC0;
+               case 0x0DB ... 0x0DF:
+               case 0x1D6:
+               case 0x1D8:
+                       return &amd_f15_PMC50;
+               default:
+                       return &amd_f15_PMC20;
+               }
+       case AMD_EVENT_NB:
+               /* moved to perf_event_amd_uncore.c */
+               return &emptyconstraint;
+       default:
+               return &emptyconstraint;
+       }
+}
+
+static ssize_t amd_event_sysfs_show(char *page, u64 config)
+{
+       u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
+                   (config & AMD64_EVENTSEL_EVENT) >> 24;
+
+       return x86_event_sysfs_show(page, config, event);
+}
+
+static __initconst const struct x86_pmu amd_pmu = {
+       .name                   = "AMD",
+       .handle_irq             = x86_pmu_handle_irq,
+       .disable_all            = x86_pmu_disable_all,
+       .enable_all             = x86_pmu_enable_all,
+       .enable                 = x86_pmu_enable_event,
+       .disable                = x86_pmu_disable_event,
+       .hw_config              = amd_pmu_hw_config,
+       .schedule_events        = x86_schedule_events,
+       .eventsel               = MSR_K7_EVNTSEL0,
+       .perfctr                = MSR_K7_PERFCTR0,
+       .addr_offset            = amd_pmu_addr_offset,
+       .event_map              = amd_pmu_event_map,
+       .max_events             = ARRAY_SIZE(amd_perfmon_event_map),
+       .num_counters           = AMD64_NUM_COUNTERS,
+       .cntval_bits            = 48,
+       .cntval_mask            = (1ULL << 48) - 1,
+       .apic                   = 1,
+       /* use highest bit to detect overflow */
+       .max_period             = (1ULL << 47) - 1,
+       .get_event_constraints  = amd_get_event_constraints,
+       .put_event_constraints  = amd_put_event_constraints,
+
+       .format_attrs           = amd_format_attr,
+       .events_sysfs_show      = amd_event_sysfs_show,
+
+       .cpu_prepare            = amd_pmu_cpu_prepare,
+       .cpu_starting           = amd_pmu_cpu_starting,
+       .cpu_dead               = amd_pmu_cpu_dead,
+};
+
+static int __init amd_core_pmu_init(void)
+{
+       if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+               return 0;
+
+       switch (boot_cpu_data.x86) {
+       case 0x15:
+               pr_cont("Fam15h ");
+               x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
+               break;
+
+       default:
+               pr_err("core perfctr but no constraints; unknown hardware!\n");
+               return -ENODEV;
+       }
+
+       /*
+        * If core performance counter extensions exists, we must use
+        * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
+        * amd_pmu_addr_offset().
+        */
+       x86_pmu.eventsel        = MSR_F15H_PERF_CTL;
+       x86_pmu.perfctr         = MSR_F15H_PERF_CTR;
+       x86_pmu.num_counters    = AMD64_NUM_COUNTERS_CORE;
+
+       pr_cont("core perfctr, ");
+       return 0;
+}
+
+__init int amd_pmu_init(void)
+{
+       int ret;
+
+       /* Performance-monitoring supported from K7 and later: */
+       if (boot_cpu_data.x86 < 6)
+               return -ENODEV;
+
+       x86_pmu = amd_pmu;
+
+       ret = amd_core_pmu_init();
+       if (ret)
+               return ret;
+
+       /* Events are common for all AMDs */
+       memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
+              sizeof(hw_cache_event_ids));
+
+       return 0;
+}
+
+void amd_pmu_enable_virt(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       cpuc->perf_ctr_virt_mask = 0;
+
+       /* Reload all events */
+       x86_pmu_disable_all();
+       x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
+
+void amd_pmu_disable_virt(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       /*
+        * We only mask out the Host-only bit so that host-only counting works
+        * when SVM is disabled. If someone sets up a guest-only counter when
+        * SVM is disabled the Guest-only bits still gets set and the counter
+        * will not count anything.
+        */
+       cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
+
+       /* Reload all events */
+       x86_pmu_disable_all();
+       x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c

new file mode 100644 (file)

index 0000000..51087c2
--- /dev/null
+++ b/arch/x86/events/amd/ibs.c
@@ -0,0 +1,959 @@
+/*
+ * Performance events - AMD IBS
+ *
+ *  Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/ptrace.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+static u32 ibs_caps;
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
+
+#include <linux/kprobes.h>
+#include <linux/hardirq.h>
+
+#include <asm/nmi.h>
+
+#define IBS_FETCH_CONFIG_MASK  (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
+#define IBS_OP_CONFIG_MASK     IBS_OP_MAX_CNT
+
+enum ibs_states {
+       IBS_ENABLED     = 0,
+       IBS_STARTED     = 1,
+       IBS_STOPPING    = 2,
+
+       IBS_MAX_STATES,
+};
+
+struct cpu_perf_ibs {
+       struct perf_event       *event;
+       unsigned long           state[BITS_TO_LONGS(IBS_MAX_STATES)];
+};
+
+struct perf_ibs {
+       struct pmu                      pmu;
+       unsigned int                    msr;
+       u64                             config_mask;
+       u64                             cnt_mask;
+       u64                             enable_mask;
+       u64                             valid_mask;
+       u64                             max_period;
+       unsigned long                   offset_mask[1];
+       int                             offset_max;
+       struct cpu_perf_ibs __percpu    *pcpu;
+
+       struct attribute                **format_attrs;
+       struct attribute_group          format_group;
+       const struct attribute_group    *attr_groups[2];
+
+       u64                             (*get_count)(u64 config);
+};
+
+struct perf_ibs_data {
+       u32             size;
+       union {
+               u32     data[0];        /* data buffer starts here */
+               u32     caps;
+       };
+       u64             regs[MSR_AMD64_IBS_REG_COUNT_MAX];
+};
+
+static int
+perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
+{
+       s64 left = local64_read(&hwc->period_left);
+       s64 period = hwc->sample_period;
+       int overflow = 0;
+
+       /*
+        * If we are way outside a reasonable range then just skip forward:
+        */
+       if (unlikely(left <= -period)) {
+               left = period;
+               local64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               overflow = 1;
+       }
+
+       if (unlikely(left < (s64)min)) {
+               left += period;
+               local64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               overflow = 1;
+       }
+
+       /*
+        * If the hw period that triggers the sw overflow is too short
+        * we might hit the irq handler. This biases the results.
+        * Thus we shorten the next-to-last period and set the last
+        * period to the max period.
+        */
+       if (left > max) {
+               left -= max;
+               if (left > max)
+                       left = max;
+               else if (left < min)
+                       left = min;
+       }
+
+       *hw_period = (u64)left;
+
+       return overflow;
+}
+
+static  int
+perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       int shift = 64 - width;
+       u64 prev_raw_count;
+       u64 delta;
+
+       /*
+        * Careful: an NMI might modify the previous event value.
+        *
+        * Our tactic to handle this is to first atomically read and
+        * exchange a new raw count - then add that new-prev delta
+        * count to the generic event atomically:
+        */
+       prev_raw_count = local64_read(&hwc->prev_count);
+       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                                       new_raw_count) != prev_raw_count)
+               return 0;
+
+       /*
+        * Now we have the new raw value and have updated the prev
+        * timestamp already. We can now calculate the elapsed delta
+        * (event-)time and add that to the generic event.
+        *
+        * Careful, not all hw sign-extends above the physical width
+        * of the count.
+        */
+       delta = (new_raw_count << shift) - (prev_raw_count << shift);
+       delta >>= shift;
+
+       local64_add(delta, &event->count);
+       local64_sub(delta, &hwc->period_left);
+
+       return 1;
+}
+
+static struct perf_ibs perf_ibs_fetch;
+static struct perf_ibs perf_ibs_op;
+
+static struct perf_ibs *get_ibs_pmu(int type)
+{
+       if (perf_ibs_fetch.pmu.type == type)
+               return &perf_ibs_fetch;
+       if (perf_ibs_op.pmu.type == type)
+               return &perf_ibs_op;
+       return NULL;
+}
+
+/*
+ * Use IBS for precise event sampling:
+ *
+ *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
+ *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
+ *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
+ *
+ * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
+ * MSRC001_1033) is used to select either cycle or micro-ops counting
+ * mode.
+ *
+ * The rip of IBS samples has skid 0. Thus, IBS supports precise
+ * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
+ * rip is invalid when IBS was not able to record the rip correctly.
+ * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
+ *
+ */
+static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
+{
+       switch (event->attr.precise_ip) {
+       case 0:
+               return -ENOENT;
+       case 1:
+       case 2:
+               break;
+       default:
+               return -EOPNOTSUPP;
+       }
+
+       switch (event->attr.type) {
+       case PERF_TYPE_HARDWARE:
+               switch (event->attr.config) {
+               case PERF_COUNT_HW_CPU_CYCLES:
+                       *config = 0;
+                       return 0;
+               }
+               break;
+       case PERF_TYPE_RAW:
+               switch (event->attr.config) {
+               case 0x0076:
+                       *config = 0;
+                       return 0;
+               case 0x00C1:
+                       *config = IBS_OP_CNT_CTL;
+                       return 0;
+               }
+               break;
+       default:
+               return -ENOENT;
+       }
+
+       return -EOPNOTSUPP;
+}
+
+static const struct perf_event_attr ibs_notsupp = {
+       .exclude_user   = 1,
+       .exclude_kernel = 1,
+       .exclude_hv     = 1,
+       .exclude_idle   = 1,
+       .exclude_host   = 1,
+       .exclude_guest  = 1,
+};
+
+static int perf_ibs_init(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct perf_ibs *perf_ibs;
+       u64 max_cnt, config;
+       int ret;
+
+       perf_ibs = get_ibs_pmu(event->attr.type);
+       if (perf_ibs) {
+               config = event->attr.config;
+       } else {
+               perf_ibs = &perf_ibs_op;
+               ret = perf_ibs_precise_event(event, &config);
+               if (ret)
+                       return ret;
+       }
+
+       if (event->pmu != &perf_ibs->pmu)
+               return -ENOENT;
+
+       if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
+               return -EINVAL;
+
+       if (config & ~perf_ibs->config_mask)
+               return -EINVAL;
+
+       if (hwc->sample_period) {
+               if (config & perf_ibs->cnt_mask)
+                       /* raw max_cnt may not be set */
+                       return -EINVAL;
+               if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
+                       /*
+                        * lower 4 bits can not be set in ibs max cnt,
+                        * but allowing it in case we adjust the
+                        * sample period to set a frequency.
+                        */
+                       return -EINVAL;
+               hwc->sample_period &= ~0x0FULL;
+               if (!hwc->sample_period)
+                       hwc->sample_period = 0x10;
+       } else {
+               max_cnt = config & perf_ibs->cnt_mask;
+               config &= ~perf_ibs->cnt_mask;
+               event->attr.sample_period = max_cnt << 4;
+               hwc->sample_period = event->attr.sample_period;
+       }
+
+       if (!hwc->sample_period)
+               return -EINVAL;
+
+       /*
+        * If we modify hwc->sample_period, we also need to update
+        * hwc->last_period and hwc->period_left.
+        */
+       hwc->last_period = hwc->sample_period;
+       local64_set(&hwc->period_left, hwc->sample_period);
+
+       hwc->config_base = perf_ibs->msr;
+       hwc->config = config;
+
+       return 0;
+}
+
+static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
+                              struct hw_perf_event *hwc, u64 *period)
+{
+       int overflow;
+
+       /* ignore lower 4 bits in min count: */
+       overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
+       local64_set(&hwc->prev_count, 0);
+
+       return overflow;
+}
+
+static u64 get_ibs_fetch_count(u64 config)
+{
+       return (config & IBS_FETCH_CNT) >> 12;
+}
+
+static u64 get_ibs_op_count(u64 config)
+{
+       u64 count = 0;
+
+       if (config & IBS_OP_VAL)
+               count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
+
+       if (ibs_caps & IBS_CAPS_RDWROPCNT)
+               count += (config & IBS_OP_CUR_CNT) >> 32;
+
+       return count;
+}
+
+static void
+perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
+                     u64 *config)
+{
+       u64 count = perf_ibs->get_count(*config);
+
+       /*
+        * Set width to 64 since we do not overflow on max width but
+        * instead on max count. In perf_ibs_set_period() we clear
+        * prev count manually on overflow.
+        */
+       while (!perf_event_try_update(event, count, 64)) {
+               rdmsrl(event->hw.config_base, *config);
+               count = perf_ibs->get_count(*config);
+       }
+}
+
+static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
+                                        struct hw_perf_event *hwc, u64 config)
+{
+       wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
+}
+
+/*
+ * Erratum #420 Instruction-Based Sampling Engine May Generate
+ * Interrupt that Cannot Be Cleared:
+ *
+ * Must clear counter mask first, then clear the enable bit. See
+ * Revision Guide for AMD Family 10h Processors, Publication #41322.
+ */
+static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
+                                         struct hw_perf_event *hwc, u64 config)
+{
+       config &= ~perf_ibs->cnt_mask;
+       wrmsrl(hwc->config_base, config);
+       config &= ~perf_ibs->enable_mask;
+       wrmsrl(hwc->config_base, config);
+}
+
+/*
+ * We cannot restore the ibs pmu state, so we always needs to update
+ * the event while stopping it and then reset the state when starting
+ * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in
+ * perf_ibs_start()/perf_ibs_stop() and instead always do it.
+ */
+static void perf_ibs_start(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+       u64 period;
+
+       if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+               return;
+
+       WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+       hwc->state = 0;
+
+       perf_ibs_set_period(perf_ibs, hwc, &period);
+       set_bit(IBS_STARTED, pcpu->state);
+       perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+       perf_event_update_userpage(event);
+}
+
+static void perf_ibs_stop(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+       u64 config;
+       int stopping;
+
+       stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
+
+       if (!stopping && (hwc->state & PERF_HES_UPTODATE))
+               return;
+
+       rdmsrl(hwc->config_base, config);
+
+       if (stopping) {
+               set_bit(IBS_STOPPING, pcpu->state);
+               perf_ibs_disable_event(perf_ibs, hwc, config);
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+       }
+
+       if (hwc->state & PERF_HES_UPTODATE)
+               return;
+
+       /*
+        * Clear valid bit to not count rollovers on update, rollovers
+        * are only updated in the irq handler.
+        */
+       config &= ~perf_ibs->valid_mask;
+
+       perf_ibs_event_update(perf_ibs, event, &config);
+       hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int perf_ibs_add(struct perf_event *event, int flags)
+{
+       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+       if (test_and_set_bit(IBS_ENABLED, pcpu->state))
+               return -ENOSPC;
+
+       event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+       pcpu->event = event;
+
+       if (flags & PERF_EF_START)
+               perf_ibs_start(event, PERF_EF_RELOAD);
+
+       return 0;
+}
+
+static void perf_ibs_del(struct perf_event *event, int flags)
+{
+       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+       if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
+               return;
+
+       perf_ibs_stop(event, PERF_EF_UPDATE);
+
+       pcpu->event = NULL;
+
+       perf_event_update_userpage(event);
+}
+
+static void perf_ibs_read(struct perf_event *event) { }
+
+PMU_FORMAT_ATTR(rand_en,       "config:57");
+PMU_FORMAT_ATTR(cnt_ctl,       "config:19");
+
+static struct attribute *ibs_fetch_format_attrs[] = {
+       &format_attr_rand_en.attr,
+       NULL,
+};
+
+static struct attribute *ibs_op_format_attrs[] = {
+       NULL,   /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
+       NULL,
+};
+
+static struct perf_ibs perf_ibs_fetch = {
+       .pmu = {
+               .task_ctx_nr    = perf_invalid_context,
+
+               .event_init     = perf_ibs_init,
+               .add            = perf_ibs_add,
+               .del            = perf_ibs_del,
+               .start          = perf_ibs_start,
+               .stop           = perf_ibs_stop,
+               .read           = perf_ibs_read,
+       },
+       .msr                    = MSR_AMD64_IBSFETCHCTL,
+       .config_mask            = IBS_FETCH_CONFIG_MASK,
+       .cnt_mask               = IBS_FETCH_MAX_CNT,
+       .enable_mask            = IBS_FETCH_ENABLE,
+       .valid_mask             = IBS_FETCH_VAL,
+       .max_period             = IBS_FETCH_MAX_CNT << 4,
+       .offset_mask            = { MSR_AMD64_IBSFETCH_REG_MASK },
+       .offset_max             = MSR_AMD64_IBSFETCH_REG_COUNT,
+       .format_attrs           = ibs_fetch_format_attrs,
+
+       .get_count              = get_ibs_fetch_count,
+};
+
+static struct perf_ibs perf_ibs_op = {
+       .pmu = {
+               .task_ctx_nr    = perf_invalid_context,
+
+               .event_init     = perf_ibs_init,
+               .add            = perf_ibs_add,
+               .del            = perf_ibs_del,
+               .start          = perf_ibs_start,
+               .stop           = perf_ibs_stop,
+               .read           = perf_ibs_read,
+       },
+       .msr                    = MSR_AMD64_IBSOPCTL,
+       .config_mask            = IBS_OP_CONFIG_MASK,
+       .cnt_mask               = IBS_OP_MAX_CNT,
+       .enable_mask            = IBS_OP_ENABLE,
+       .valid_mask             = IBS_OP_VAL,
+       .max_period             = IBS_OP_MAX_CNT << 4,
+       .offset_mask            = { MSR_AMD64_IBSOP_REG_MASK },
+       .offset_max             = MSR_AMD64_IBSOP_REG_COUNT,
+       .format_attrs           = ibs_op_format_attrs,
+
+       .get_count              = get_ibs_op_count,
+};
+
+static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
+{
+       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+       struct perf_event *event = pcpu->event;
+       struct hw_perf_event *hwc = &event->hw;
+       struct perf_sample_data data;
+       struct perf_raw_record raw;
+       struct pt_regs regs;
+       struct perf_ibs_data ibs_data;
+       int offset, size, check_rip, offset_max, throttle = 0;
+       unsigned int msr;
+       u64 *buf, *config, period;
+
+       if (!test_bit(IBS_STARTED, pcpu->state)) {
+               /*
+                * Catch spurious interrupts after stopping IBS: After
+                * disabling IBS there could be still incoming NMIs
+                * with samples that even have the valid bit cleared.
+                * Mark all this NMIs as handled.
+                */
+               return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
+       }
+
+       msr = hwc->config_base;
+       buf = ibs_data.regs;
+       rdmsrl(msr, *buf);
+       if (!(*buf++ & perf_ibs->valid_mask))
+               return 0;
+
+       config = &ibs_data.regs[0];
+       perf_ibs_event_update(perf_ibs, event, config);
+       perf_sample_data_init(&data, 0, hwc->last_period);
+       if (!perf_ibs_set_period(perf_ibs, hwc, &period))
+               goto out;       /* no sw counter overflow */
+
+       ibs_data.caps = ibs_caps;
+       size = 1;
+       offset = 1;
+       check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
+       if (event->attr.sample_type & PERF_SAMPLE_RAW)
+               offset_max = perf_ibs->offset_max;
+       else if (check_rip)
+               offset_max = 2;
+       else
+               offset_max = 1;
+       do {
+               rdmsrl(msr + offset, *buf++);
+               size++;
+               offset = find_next_bit(perf_ibs->offset_mask,
+                                      perf_ibs->offset_max,
+                                      offset + 1);
+       } while (offset < offset_max);
+       if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+               /*
+                * Read IbsBrTarget and IbsOpData4 separately
+                * depending on their availability.
+                * Can't add to offset_max as they are staggered
+                */
+               if (ibs_caps & IBS_CAPS_BRNTRGT) {
+                       rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
+                       size++;
+               }
+               if (ibs_caps & IBS_CAPS_OPDATA4) {
+                       rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
+                       size++;
+               }
+       }
+       ibs_data.size = sizeof(u64) * size;
+
+       regs = *iregs;
+       if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
+               regs.flags &= ~PERF_EFLAGS_EXACT;
+       } else {
+               set_linear_ip(&regs, ibs_data.regs[1]);
+               regs.flags |= PERF_EFLAGS_EXACT;
+       }
+
+       if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+               raw.size = sizeof(u32) + ibs_data.size;
+               raw.data = ibs_data.data;
+               data.raw = &raw;
+       }
+
+       throttle = perf_event_overflow(event, &data, &regs);
+out:
+       if (throttle)
+               perf_ibs_disable_event(perf_ibs, hwc, *config);
+       else
+               perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+       perf_event_update_userpage(event);
+
+       return 1;
+}
+
+static int
+perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+       int handled = 0;
+
+       handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
+       handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
+
+       if (handled)
+               inc_irq_stat(apic_perf_irqs);
+
+       return handled;
+}
+NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
+
+static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
+{
+       struct cpu_perf_ibs __percpu *pcpu;
+       int ret;
+
+       pcpu = alloc_percpu(struct cpu_perf_ibs);
+       if (!pcpu)
+               return -ENOMEM;
+
+       perf_ibs->pcpu = pcpu;
+
+       /* register attributes */
+       if (perf_ibs->format_attrs[0]) {
+               memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
+               perf_ibs->format_group.name     = "format";
+               perf_ibs->format_group.attrs    = perf_ibs->format_attrs;
+
+               memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
+               perf_ibs->attr_groups[0]        = &perf_ibs->format_group;
+               perf_ibs->pmu.attr_groups       = perf_ibs->attr_groups;
+       }
+
+       ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
+       if (ret) {
+               perf_ibs->pcpu = NULL;
+               free_percpu(pcpu);
+       }
+
+       return ret;
+}
+
+static __init int perf_event_ibs_init(void)
+{
+       struct attribute **attr = ibs_op_format_attrs;
+
+       if (!ibs_caps)
+               return -ENODEV; /* ibs not supported by the cpu */
+
+       perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+
+       if (ibs_caps & IBS_CAPS_OPCNT) {
+               perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
+               *attr++ = &format_attr_cnt_ctl.attr;
+       }
+       perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+
+       register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
+       pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps);
+
+       return 0;
+}
+
+#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
+
+static __init int perf_event_ibs_init(void) { return 0; }
+
+#endif
+
+/* IBS - apic initialization, for perf and oprofile */
+
+static __init u32 __get_ibs_caps(void)
+{
+       u32 caps;
+       unsigned int max_level;
+
+       if (!boot_cpu_has(X86_FEATURE_IBS))
+               return 0;
+
+       /* check IBS cpuid feature flags */
+       max_level = cpuid_eax(0x80000000);
+       if (max_level < IBS_CPUID_FEATURES)
+               return IBS_CAPS_DEFAULT;
+
+       caps = cpuid_eax(IBS_CPUID_FEATURES);
+       if (!(caps & IBS_CAPS_AVAIL))
+               /* cpuid flags not valid */
+               return IBS_CAPS_DEFAULT;
+
+       return caps;
+}
+
+u32 get_ibs_caps(void)
+{
+       return ibs_caps;
+}
+
+EXPORT_SYMBOL(get_ibs_caps);
+
+static inline int get_eilvt(int offset)
+{
+       return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
+}
+
+static inline int put_eilvt(int offset)
+{
+       return !setup_APIC_eilvt(offset, 0, 0, 1);
+}
+
+/*
+ * Check and reserve APIC extended interrupt LVT offset for IBS if available.
+ */
+static inline int ibs_eilvt_valid(void)
+{
+       int offset;
+       u64 val;
+       int valid = 0;
+
+       preempt_disable();
+
+       rdmsrl(MSR_AMD64_IBSCTL, val);
+       offset = val & IBSCTL_LVT_OFFSET_MASK;
+
+       if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
+               pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
+                      smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+               goto out;
+       }
+
+       if (!get_eilvt(offset)) {
+               pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
+                      smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+               goto out;
+       }
+
+       valid = 1;
+out:
+       preempt_enable();
+
+       return valid;
+}
+
+static int setup_ibs_ctl(int ibs_eilvt_off)
+{
+       struct pci_dev *cpu_cfg;
+       int nodes;
+       u32 value = 0;
+
+       nodes = 0;
+       cpu_cfg = NULL;
+       do {
+               cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
+                                        PCI_DEVICE_ID_AMD_10H_NB_MISC,
+                                        cpu_cfg);
+               if (!cpu_cfg)
+                       break;
+               ++nodes;
+               pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
+                                      | IBSCTL_LVT_OFFSET_VALID);
+               pci_read_config_dword(cpu_cfg, IBSCTL, &value);
+               if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
+                       pci_dev_put(cpu_cfg);
+                       pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n",
+                                value);
+                       return -EINVAL;
+               }
+       } while (1);
+
+       if (!nodes) {
+               pr_debug("No CPU node configured for IBS\n");
+               return -ENODEV;
+       }
+
+       return 0;
+}
+
+/*
+ * This runs only on the current cpu. We try to find an LVT offset and
+ * setup the local APIC. For this we must disable preemption. On
+ * success we initialize all nodes with this offset. This updates then
+ * the offset in the IBS_CTL per-node msr. The per-core APIC setup of
+ * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
+ * is using the new offset.
+ */
+static void force_ibs_eilvt_setup(void)
+{
+       int offset;
+       int ret;
+
+       preempt_disable();
+       /* find the next free available EILVT entry, skip offset 0 */
+       for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
+               if (get_eilvt(offset))
+                       break;
+       }
+       preempt_enable();
+
+       if (offset == APIC_EILVT_NR_MAX) {
+               pr_debug("No EILVT entry available\n");
+               return;
+       }
+
+       ret = setup_ibs_ctl(offset);
+       if (ret)
+               goto out;
+
+       if (!ibs_eilvt_valid())
+               goto out;
+
+       pr_info("IBS: LVT offset %d assigned\n", offset);
+
+       return;
+out:
+       preempt_disable();
+       put_eilvt(offset);
+       preempt_enable();
+       return;
+}
+
+static void ibs_eilvt_setup(void)
+{
+       /*
+        * Force LVT offset assignment for family 10h: The offsets are
+        * not assigned by the BIOS for this family, so the OS is
+        * responsible for doing it. If the OS assignment fails, fall
+        * back to BIOS settings and try to setup this.
+        */
+       if (boot_cpu_data.x86 == 0x10)
+               force_ibs_eilvt_setup();
+}
+
+static inline int get_ibs_lvt_offset(void)
+{
+       u64 val;
+
+       rdmsrl(MSR_AMD64_IBSCTL, val);
+       if (!(val & IBSCTL_LVT_OFFSET_VALID))
+               return -EINVAL;
+
+       return val & IBSCTL_LVT_OFFSET_MASK;
+}
+
+static void setup_APIC_ibs(void *dummy)
+{
+       int offset;
+
+       offset = get_ibs_lvt_offset();
+       if (offset < 0)
+               goto failed;
+
+       if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
+               return;
+failed:
+       pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
+               smp_processor_id());
+}
+
+static void clear_APIC_ibs(void *dummy)
+{
+       int offset;
+
+       offset = get_ibs_lvt_offset();
+       if (offset >= 0)
+               setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
+}
+
+#ifdef CONFIG_PM
+
+static int perf_ibs_suspend(void)
+{
+       clear_APIC_ibs(NULL);
+       return 0;
+}
+
+static void perf_ibs_resume(void)
+{
+       ibs_eilvt_setup();
+       setup_APIC_ibs(NULL);
+}
+
+static struct syscore_ops perf_ibs_syscore_ops = {
+       .resume         = perf_ibs_resume,
+       .suspend        = perf_ibs_suspend,
+};
+
+static void perf_ibs_pm_init(void)
+{
+       register_syscore_ops(&perf_ibs_syscore_ops);
+}
+
+#else
+
+static inline void perf_ibs_pm_init(void) { }
+
+#endif
+
+static int
+perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_STARTING:
+               setup_APIC_ibs(NULL);
+               break;
+       case CPU_DYING:
+               clear_APIC_ibs(NULL);
+               break;
+       default:
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+static __init int amd_ibs_init(void)
+{
+       u32 caps;
+       int ret = -EINVAL;
+
+       caps = __get_ibs_caps();
+       if (!caps)
+               return -ENODEV; /* ibs not supported by the cpu */
+
+       ibs_eilvt_setup();
+
+       if (!ibs_eilvt_valid())
+               goto out;
+
+       perf_ibs_pm_init();
+       cpu_notifier_register_begin();
+       ibs_caps = caps;
+       /* make ibs_caps visible to other cpus: */
+       smp_mb();
+       smp_call_function(setup_APIC_ibs, NULL, 1);
+       __perf_cpu_notifier(perf_ibs_cpu_notifier);
+       cpu_notifier_register_done();
+
+       ret = perf_event_ibs_init();
+out:
+       if (ret)
+               pr_err("Failed to setup IBS, %d\n", ret);
+       return ret;
+}
+
+/* Since we need the pci subsystem to init ibs we can't do this earlier: */
+device_initcall(amd_ibs_init);
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c

new file mode 100644 (file)

index 0000000..635e5eb
--- /dev/null
+++ b/arch/x86/events/amd/iommu.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Steven Kinney <Steven.Kinney@amd.com>
+ * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
+ *
+ * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/module.h>
+#include <linux/cpumask.h>
+#include <linux/slab.h>
+
+#include "../perf_event.h"
+#include "iommu.h"
+
+#define COUNTER_SHIFT          16
+
+#define _GET_BANK(ev)       ((u8)(ev->hw.extra_reg.reg >> 8))
+#define _GET_CNTR(ev)       ((u8)(ev->hw.extra_reg.reg))
+
+/* iommu pmu config masks */
+#define _GET_CSOURCE(ev)    ((ev->hw.config & 0xFFULL))
+#define _GET_DEVID(ev)      ((ev->hw.config >> 8)  & 0xFFFFULL)
+#define _GET_PASID(ev)      ((ev->hw.config >> 24) & 0xFFFFULL)
+#define _GET_DOMID(ev)      ((ev->hw.config >> 40) & 0xFFFFULL)
+#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config)  & 0xFFFFULL)
+#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL)
+#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL)
+
+static struct perf_amd_iommu __perf_iommu;
+
+struct perf_amd_iommu {
+       struct pmu pmu;
+       u8 max_banks;
+       u8 max_counters;
+       u64 cntr_assign_mask;
+       raw_spinlock_t lock;
+       const struct attribute_group *attr_groups[4];
+};
+
+#define format_group   attr_groups[0]
+#define cpumask_group  attr_groups[1]
+#define events_group   attr_groups[2]
+#define null_group     attr_groups[3]
+
+/*---------------------------------------------
+ * sysfs format attributes
+ *---------------------------------------------*/
+PMU_FORMAT_ATTR(csource,    "config:0-7");
+PMU_FORMAT_ATTR(devid,      "config:8-23");
+PMU_FORMAT_ATTR(pasid,      "config:24-39");
+PMU_FORMAT_ATTR(domid,      "config:40-55");
+PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
+PMU_FORMAT_ATTR(pasid_mask, "config1:16-31");
+PMU_FORMAT_ATTR(domid_mask, "config1:32-47");
+
+static struct attribute *iommu_format_attrs[] = {
+       &format_attr_csource.attr,
+       &format_attr_devid.attr,
+       &format_attr_pasid.attr,
+       &format_attr_domid.attr,
+       &format_attr_devid_mask.attr,
+       &format_attr_pasid_mask.attr,
+       &format_attr_domid_mask.attr,
+       NULL,
+};
+
+static struct attribute_group amd_iommu_format_group = {
+       .name = "format",
+       .attrs = iommu_format_attrs,
+};
+
+/*---------------------------------------------
+ * sysfs events attributes
+ *---------------------------------------------*/
+struct amd_iommu_event_desc {
+       struct kobj_attribute attr;
+       const char *event;
+};
+
+static ssize_t _iommu_event_show(struct kobject *kobj,
+                               struct kobj_attribute *attr, char *buf)
+{
+       struct amd_iommu_event_desc *event =
+               container_of(attr, struct amd_iommu_event_desc, attr);
+       return sprintf(buf, "%s\n", event->event);
+}
+
+#define AMD_IOMMU_EVENT_DESC(_name, _event)                    \
+{                                                              \
+       .attr  = __ATTR(_name, 0444, _iommu_event_show, NULL),  \
+       .event = _event,                                        \
+}
+
+static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
+       AMD_IOMMU_EVENT_DESC(mem_pass_untrans,        "csource=0x01"),
+       AMD_IOMMU_EVENT_DESC(mem_pass_pretrans,       "csource=0x02"),
+       AMD_IOMMU_EVENT_DESC(mem_pass_excl,           "csource=0x03"),
+       AMD_IOMMU_EVENT_DESC(mem_target_abort,        "csource=0x04"),
+       AMD_IOMMU_EVENT_DESC(mem_trans_total,         "csource=0x05"),
+       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit,   "csource=0x06"),
+       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis,   "csource=0x07"),
+       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit,   "csource=0x08"),
+       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis,   "csource=0x09"),
+       AMD_IOMMU_EVENT_DESC(mem_dte_hit,             "csource=0x0a"),
+       AMD_IOMMU_EVENT_DESC(mem_dte_mis,             "csource=0x0b"),
+       AMD_IOMMU_EVENT_DESC(page_tbl_read_tot,       "csource=0x0c"),
+       AMD_IOMMU_EVENT_DESC(page_tbl_read_nst,       "csource=0x0d"),
+       AMD_IOMMU_EVENT_DESC(page_tbl_read_gst,       "csource=0x0e"),
+       AMD_IOMMU_EVENT_DESC(int_dte_hit,             "csource=0x0f"),
+       AMD_IOMMU_EVENT_DESC(int_dte_mis,             "csource=0x10"),
+       AMD_IOMMU_EVENT_DESC(cmd_processed,           "csource=0x11"),
+       AMD_IOMMU_EVENT_DESC(cmd_processed_inv,       "csource=0x12"),
+       AMD_IOMMU_EVENT_DESC(tlb_inv,                 "csource=0x13"),
+       { /* end: all zeroes */ },
+};
+
+/*---------------------------------------------
+ * sysfs cpumask attributes
+ *---------------------------------------------*/
+static cpumask_t iommu_cpumask;
+
+static ssize_t _iommu_cpumask_show(struct device *dev,
+                                  struct device_attribute *attr,
+                                  char *buf)
+{
+       return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask);
+}
+static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL);
+
+static struct attribute *iommu_cpumask_attrs[] = {
+       &dev_attr_cpumask.attr,
+       NULL,
+};
+
+static struct attribute_group amd_iommu_cpumask_group = {
+       .attrs = iommu_cpumask_attrs,
+};
+
+/*---------------------------------------------*/
+
+static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu)
+{
+       unsigned long flags;
+       int shift, bank, cntr, retval;
+       int max_banks = perf_iommu->max_banks;
+       int max_cntrs = perf_iommu->max_counters;
+
+       raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+
+       for (bank = 0, shift = 0; bank < max_banks; bank++) {
+               for (cntr = 0; cntr < max_cntrs; cntr++) {
+                       shift = bank + (bank*3) + cntr;
+                       if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) {
+                               continue;
+                       } else {
+                               perf_iommu->cntr_assign_mask |= (1ULL<<shift);
+                               retval = ((u16)((u16)bank<<8) | (u8)(cntr));
+                               goto out;
+                       }
+               }
+       }
+       retval = -ENOSPC;
+out:
+       raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+       return retval;
+}
+
+static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
+                                       u8 bank, u8 cntr)
+{
+       unsigned long flags;
+       int max_banks, max_cntrs;
+       int shift = 0;
+
+       max_banks = perf_iommu->max_banks;
+       max_cntrs = perf_iommu->max_counters;
+
+       if ((bank > max_banks) || (cntr > max_cntrs))
+               return -EINVAL;
+
+       shift = bank + cntr + (bank*3);
+
+       raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+       perf_iommu->cntr_assign_mask &= ~(1ULL<<shift);
+       raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+
+       return 0;
+}
+
+static int perf_iommu_event_init(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct perf_amd_iommu *perf_iommu;
+       u64 config, config1;
+
+       /* test the event attr type check for PMU enumeration */
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       /*
+        * IOMMU counters are shared across all cores.
+        * Therefore, it does not support per-process mode.
+        * Also, it does not support event sampling mode.
+        */
+       if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+               return -EINVAL;
+
+       /* IOMMU counters do not have usr/os/guest/host bits */
+       if (event->attr.exclude_user || event->attr.exclude_kernel ||
+           event->attr.exclude_host || event->attr.exclude_guest)
+               return -EINVAL;
+
+       if (event->cpu < 0)
+               return -EINVAL;
+
+       perf_iommu = &__perf_iommu;
+
+       if (event->pmu != &perf_iommu->pmu)
+               return -ENOENT;
+
+       if (perf_iommu) {
+               config = event->attr.config;
+               config1 = event->attr.config1;
+       } else {
+               return -EINVAL;
+       }
+
+       /* integrate with iommu base devid (0000), assume one iommu */
+       perf_iommu->max_banks =
+               amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID);
+       perf_iommu->max_counters =
+               amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID);
+       if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0))
+               return -EINVAL;
+
+       /* update the hw_perf_event struct with the iommu config data */
+       hwc->config = config;
+       hwc->extra_reg.config = config1;
+
+       return 0;
+}
+
+static void perf_iommu_enable_event(struct perf_event *ev)
+{
+       u8 csource = _GET_CSOURCE(ev);
+       u16 devid = _GET_DEVID(ev);
+       u64 reg = 0ULL;
+
+       reg = csource;
+       amd_iommu_pc_get_set_reg_val(devid,
+                       _GET_BANK(ev), _GET_CNTR(ev) ,
+                        IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+
+       reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
+       if (reg)
+               reg |= (1UL << 31);
+       amd_iommu_pc_get_set_reg_val(devid,
+                       _GET_BANK(ev), _GET_CNTR(ev) ,
+                        IOMMU_PC_DEVID_MATCH_REG, &reg, true);
+
+       reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
+       if (reg)
+               reg |= (1UL << 31);
+       amd_iommu_pc_get_set_reg_val(devid,
+                       _GET_BANK(ev), _GET_CNTR(ev) ,
+                        IOMMU_PC_PASID_MATCH_REG, &reg, true);
+
+       reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
+       if (reg)
+               reg |= (1UL << 31);
+       amd_iommu_pc_get_set_reg_val(devid,
+                       _GET_BANK(ev), _GET_CNTR(ev) ,
+                        IOMMU_PC_DOMID_MATCH_REG, &reg, true);
+}
+
+static void perf_iommu_disable_event(struct perf_event *event)
+{
+       u64 reg = 0ULL;
+
+       amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+                       _GET_BANK(event), _GET_CNTR(event),
+                       IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+}
+
+static void perf_iommu_start(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       pr_debug("perf: amd_iommu:perf_iommu_start\n");
+       if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+               return;
+
+       WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+       hwc->state = 0;
+
+       if (flags & PERF_EF_RELOAD) {
+               u64 prev_raw_count =  local64_read(&hwc->prev_count);
+               amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+                               _GET_BANK(event), _GET_CNTR(event),
+                               IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
+       }
+
+       perf_iommu_enable_event(event);
+       perf_event_update_userpage(event);
+
+}
+
+static void perf_iommu_read(struct perf_event *event)
+{
+       u64 count = 0ULL;
+       u64 prev_raw_count = 0ULL;
+       u64 delta = 0ULL;
+       struct hw_perf_event *hwc = &event->hw;
+       pr_debug("perf: amd_iommu:perf_iommu_read\n");
+
+       amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+                               _GET_BANK(event), _GET_CNTR(event),
+                               IOMMU_PC_COUNTER_REG, &count, false);
+
+       /* IOMMU pc counter register is only 48 bits */
+       count &= 0xFFFFFFFFFFFFULL;
+
+       prev_raw_count =  local64_read(&hwc->prev_count);
+       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                                       count) != prev_raw_count)
+               return;
+
+       /* Handling 48-bit counter overflowing */
+       delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
+       delta >>= COUNTER_SHIFT;
+       local64_add(delta, &event->count);
+
+}
+
+static void perf_iommu_stop(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 config;
+
+       pr_debug("perf: amd_iommu:perf_iommu_stop\n");
+
+       if (hwc->state & PERF_HES_UPTODATE)
+               return;
+
+       perf_iommu_disable_event(event);
+       WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+       hwc->state |= PERF_HES_STOPPED;
+
+       if (hwc->state & PERF_HES_UPTODATE)
+               return;
+
+       config = hwc->config;
+       perf_iommu_read(event);
+       hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int perf_iommu_add(struct perf_event *event, int flags)
+{
+       int retval;
+       struct perf_amd_iommu *perf_iommu =
+                       container_of(event->pmu, struct perf_amd_iommu, pmu);
+
+       pr_debug("perf: amd_iommu:perf_iommu_add\n");
+       event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+       /* request an iommu bank/counter */
+       retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
+       if (retval != -ENOSPC)
+               event->hw.extra_reg.reg = (u16)retval;
+       else
+               return retval;
+
+       if (flags & PERF_EF_START)
+               perf_iommu_start(event, PERF_EF_RELOAD);
+
+       return 0;
+}
+
+static void perf_iommu_del(struct perf_event *event, int flags)
+{
+       struct perf_amd_iommu *perf_iommu =
+                       container_of(event->pmu, struct perf_amd_iommu, pmu);
+
+       pr_debug("perf: amd_iommu:perf_iommu_del\n");
+       perf_iommu_stop(event, PERF_EF_UPDATE);
+
+       /* clear the assigned iommu bank/counter */
+       clear_avail_iommu_bnk_cntr(perf_iommu,
+                                    _GET_BANK(event),
+                                    _GET_CNTR(event));
+
+       perf_event_update_userpage(event);
+}
+
+static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
+{
+       struct attribute **attrs;
+       struct attribute_group *attr_group;
+       int i = 0, j;
+
+       while (amd_iommu_v2_event_descs[i].attr.attr.name)
+               i++;
+
+       attr_group = kzalloc(sizeof(struct attribute *)
+               * (i + 1) + sizeof(*attr_group), GFP_KERNEL);
+       if (!attr_group)
+               return -ENOMEM;
+
+       attrs = (struct attribute **)(attr_group + 1);
+       for (j = 0; j < i; j++)
+               attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
+
+       attr_group->name = "events";
+       attr_group->attrs = attrs;
+       perf_iommu->events_group = attr_group;
+
+       return 0;
+}
+
+static __init void amd_iommu_pc_exit(void)
+{
+       if (__perf_iommu.events_group != NULL) {
+               kfree(__perf_iommu.events_group);
+               __perf_iommu.events_group = NULL;
+       }
+}
+
+static __init int _init_perf_amd_iommu(
+       struct perf_amd_iommu *perf_iommu, char *name)
+{
+       int ret;
+
+       raw_spin_lock_init(&perf_iommu->lock);
+
+       /* Init format attributes */
+       perf_iommu->format_group = &amd_iommu_format_group;
+
+       /* Init cpumask attributes to only core 0 */
+       cpumask_set_cpu(0, &iommu_cpumask);
+       perf_iommu->cpumask_group = &amd_iommu_cpumask_group;
+
+       /* Init events attributes */
+       if (_init_events_attrs(perf_iommu) != 0)
+               pr_err("perf: amd_iommu: Only support raw events.\n");
+
+       /* Init null attributes */
+       perf_iommu->null_group = NULL;
+       perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
+
+       ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
+       if (ret) {
+               pr_err("perf: amd_iommu: Failed to initialized.\n");
+               amd_iommu_pc_exit();
+       } else {
+               pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n",
+                       amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID),
+                       amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID));
+       }
+
+       return ret;
+}
+
+static struct perf_amd_iommu __perf_iommu = {
+       .pmu = {
+               .event_init     = perf_iommu_event_init,
+               .add            = perf_iommu_add,
+               .del            = perf_iommu_del,
+               .start          = perf_iommu_start,
+               .stop           = perf_iommu_stop,
+               .read           = perf_iommu_read,
+       },
+       .max_banks              = 0x00,
+       .max_counters           = 0x00,
+       .cntr_assign_mask       = 0ULL,
+       .format_group           = NULL,
+       .cpumask_group          = NULL,
+       .events_group           = NULL,
+       .null_group             = NULL,
+};
+
+static __init int amd_iommu_pc_init(void)
+{
+       /* Make sure the IOMMU PC resource is available */
+       if (!amd_iommu_pc_supported())
+               return -ENODEV;
+
+       _init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
+
+       return 0;
+}
+
+device_initcall(amd_iommu_pc_init);
diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h

new file mode 100644 (file)

index 0000000..845d173
--- /dev/null
+++ b/arch/x86/events/amd/iommu.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Steven Kinney <Steven.Kinney@amd.com>
+ * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _PERF_EVENT_AMD_IOMMU_H_
+#define _PERF_EVENT_AMD_IOMMU_H_
+
+/* iommu pc mmio region register indexes */
+#define IOMMU_PC_COUNTER_REG                   0x00
+#define IOMMU_PC_COUNTER_SRC_REG               0x08
+#define IOMMU_PC_PASID_MATCH_REG               0x10
+#define IOMMU_PC_DOMID_MATCH_REG               0x18
+#define IOMMU_PC_DEVID_MATCH_REG               0x20
+#define IOMMU_PC_COUNTER_REPORT_REG            0x28
+
+/* maximun specified bank/counters */
+#define PC_MAX_SPEC_BNKS                       64
+#define PC_MAX_SPEC_CNTRS                      16
+
+/* iommu pc reg masks*/
+#define IOMMU_BASE_DEVID                       0x0000
+
+/* amd_iommu_init.c external support functions */
+extern bool amd_iommu_pc_supported(void);
+
+extern u8 amd_iommu_pc_get_max_banks(u16 devid);
+
+extern u8 amd_iommu_pc_get_max_counters(u16 devid);
+
+extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
+                       u8 fxn, u64 *value, bool is_write);
+
+#endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c

new file mode 100644 (file)

index 0000000..3db9569
--- /dev/null
+++ b/arch/x86/events/amd/uncore.c
@@ -0,0 +1,603 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/percpu.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+
+#include <asm/cpufeature.h>
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+
+#define NUM_COUNTERS_NB                4
+#define NUM_COUNTERS_L2                4
+#define MAX_COUNTERS           NUM_COUNTERS_NB
+
+#define RDPMC_BASE_NB          6
+#define RDPMC_BASE_L2          10
+
+#define COUNTER_SHIFT          16
+
+struct amd_uncore {
+       int id;
+       int refcnt;
+       int cpu;
+       int num_counters;
+       int rdpmc_base;
+       u32 msr_base;
+       cpumask_t *active_mask;
+       struct pmu *pmu;
+       struct perf_event *events[MAX_COUNTERS];
+       struct amd_uncore *free_when_cpu_online;
+};
+
+static struct amd_uncore * __percpu *amd_uncore_nb;
+static struct amd_uncore * __percpu *amd_uncore_l2;
+
+static struct pmu amd_nb_pmu;
+static struct pmu amd_l2_pmu;
+
+static cpumask_t amd_nb_active_mask;
+static cpumask_t amd_l2_active_mask;
+
+static bool is_nb_event(struct perf_event *event)
+{
+       return event->pmu->type == amd_nb_pmu.type;
+}
+
+static bool is_l2_event(struct perf_event *event)
+{
+       return event->pmu->type == amd_l2_pmu.type;
+}
+
+static struct amd_uncore *event_to_amd_uncore(struct perf_event *event)
+{
+       if (is_nb_event(event) && amd_uncore_nb)
+               return *per_cpu_ptr(amd_uncore_nb, event->cpu);
+       else if (is_l2_event(event) && amd_uncore_l2)
+               return *per_cpu_ptr(amd_uncore_l2, event->cpu);
+
+       return NULL;
+}
+
+static void amd_uncore_read(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 prev, new;
+       s64 delta;
+
+       /*
+        * since we do not enable counter overflow interrupts,
+        * we do not have to worry about prev_count changing on us
+        */
+
+       prev = local64_read(&hwc->prev_count);
+       rdpmcl(hwc->event_base_rdpmc, new);
+       local64_set(&hwc->prev_count, new);
+       delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
+       delta >>= COUNTER_SHIFT;
+       local64_add(delta, &event->count);
+}
+
+static void amd_uncore_start(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (flags & PERF_EF_RELOAD)
+               wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count));
+
+       hwc->state = 0;
+       wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE));
+       perf_event_update_userpage(event);
+}
+
+static void amd_uncore_stop(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       wrmsrl(hwc->config_base, hwc->config);
+       hwc->state |= PERF_HES_STOPPED;
+
+       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               amd_uncore_read(event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+}
+
+static int amd_uncore_add(struct perf_event *event, int flags)
+{
+       int i;
+       struct amd_uncore *uncore = event_to_amd_uncore(event);
+       struct hw_perf_event *hwc = &event->hw;
+
+       /* are we already assigned? */
+       if (hwc->idx != -1 && uncore->events[hwc->idx] == event)
+               goto out;
+
+       for (i = 0; i < uncore->num_counters; i++) {
+               if (uncore->events[i] == event) {
+                       hwc->idx = i;
+                       goto out;
+               }
+       }
+
+       /* if not, take the first available counter */
+       hwc->idx = -1;
+       for (i = 0; i < uncore->num_counters; i++) {
+               if (cmpxchg(&uncore->events[i], NULL, event) == NULL) {
+                       hwc->idx = i;
+                       break;
+               }
+       }
+
+out:
+       if (hwc->idx == -1)
+               return -EBUSY;
+
+       hwc->config_base = uncore->msr_base + (2 * hwc->idx);
+       hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx);
+       hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx;
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+       if (flags & PERF_EF_START)
+               amd_uncore_start(event, PERF_EF_RELOAD);
+
+       return 0;
+}
+
+static void amd_uncore_del(struct perf_event *event, int flags)
+{
+       int i;
+       struct amd_uncore *uncore = event_to_amd_uncore(event);
+       struct hw_perf_event *hwc = &event->hw;
+
+       amd_uncore_stop(event, PERF_EF_UPDATE);
+
+       for (i = 0; i < uncore->num_counters; i++) {
+               if (cmpxchg(&uncore->events[i], event, NULL) == event)
+                       break;
+       }
+
+       hwc->idx = -1;
+}
+
+static int amd_uncore_event_init(struct perf_event *event)
+{
+       struct amd_uncore *uncore;
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       /*
+        * NB and L2 counters (MSRs) are shared across all cores that share the
+        * same NB / L2 cache. Interrupts can be directed to a single target
+        * core, however, event counts generated by processes running on other
+        * cores cannot be masked out. So we do not support sampling and
+        * per-thread events.
+        */
+       if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+               return -EINVAL;
+
+       /* NB and L2 counters do not have usr/os/guest/host bits */
+       if (event->attr.exclude_user || event->attr.exclude_kernel ||
+           event->attr.exclude_host || event->attr.exclude_guest)
+               return -EINVAL;
+
+       /* and we do not enable counter overflow interrupts */
+       hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
+       hwc->idx = -1;
+
+       if (event->cpu < 0)
+               return -EINVAL;
+
+       uncore = event_to_amd_uncore(event);
+       if (!uncore)
+               return -ENODEV;
+
+       /*
+        * since request can come in to any of the shared cores, we will remap
+        * to a single common cpu.
+        */
+       event->cpu = uncore->cpu;
+
+       return 0;
+}
+
+static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
+                                           struct device_attribute *attr,
+                                           char *buf)
+{
+       cpumask_t *active_mask;
+       struct pmu *pmu = dev_get_drvdata(dev);
+
+       if (pmu->type == amd_nb_pmu.type)
+               active_mask = &amd_nb_active_mask;
+       else if (pmu->type == amd_l2_pmu.type)
+               active_mask = &amd_l2_active_mask;
+       else
+               return 0;
+
+       return cpumap_print_to_pagebuf(true, buf, active_mask);
+}
+static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
+
+static struct attribute *amd_uncore_attrs[] = {
+       &dev_attr_cpumask.attr,
+       NULL,
+};
+
+static struct attribute_group amd_uncore_attr_group = {
+       .attrs = amd_uncore_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7,32-35");
+PMU_FORMAT_ATTR(umask, "config:8-15");
+
+static struct attribute *amd_uncore_format_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       NULL,
+};
+
+static struct attribute_group amd_uncore_format_group = {
+       .name = "format",
+       .attrs = amd_uncore_format_attr,
+};
+
+static const struct attribute_group *amd_uncore_attr_groups[] = {
+       &amd_uncore_attr_group,
+       &amd_uncore_format_group,
+       NULL,
+};
+
+static struct pmu amd_nb_pmu = {
+       .attr_groups    = amd_uncore_attr_groups,
+       .name           = "amd_nb",
+       .event_init     = amd_uncore_event_init,
+       .add            = amd_uncore_add,
+       .del            = amd_uncore_del,
+       .start          = amd_uncore_start,
+       .stop           = amd_uncore_stop,
+       .read           = amd_uncore_read,
+};
+
+static struct pmu amd_l2_pmu = {
+       .attr_groups    = amd_uncore_attr_groups,
+       .name           = "amd_l2",
+       .event_init     = amd_uncore_event_init,
+       .add            = amd_uncore_add,
+       .del            = amd_uncore_del,
+       .start          = amd_uncore_start,
+       .stop           = amd_uncore_stop,
+       .read           = amd_uncore_read,
+};
+
+static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
+{
+       return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL,
+                       cpu_to_node(cpu));
+}
+
+static int amd_uncore_cpu_up_prepare(unsigned int cpu)
+{
+       struct amd_uncore *uncore_nb = NULL, *uncore_l2;
+
+       if (amd_uncore_nb) {
+               uncore_nb = amd_uncore_alloc(cpu);
+               if (!uncore_nb)
+                       goto fail;
+               uncore_nb->cpu = cpu;
+               uncore_nb->num_counters = NUM_COUNTERS_NB;
+               uncore_nb->rdpmc_base = RDPMC_BASE_NB;
+               uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
+               uncore_nb->active_mask = &amd_nb_active_mask;
+               uncore_nb->pmu = &amd_nb_pmu;
+               *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
+       }
+
+       if (amd_uncore_l2) {
+               uncore_l2 = amd_uncore_alloc(cpu);
+               if (!uncore_l2)
+                       goto fail;
+               uncore_l2->cpu = cpu;
+               uncore_l2->num_counters = NUM_COUNTERS_L2;
+               uncore_l2->rdpmc_base = RDPMC_BASE_L2;
+               uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
+               uncore_l2->active_mask = &amd_l2_active_mask;
+               uncore_l2->pmu = &amd_l2_pmu;
+               *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
+       }
+
+       return 0;
+
+fail:
+       if (amd_uncore_nb)
+               *per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
+       kfree(uncore_nb);
+       return -ENOMEM;
+}
+
+static struct amd_uncore *
+amd_uncore_find_online_sibling(struct amd_uncore *this,
+                              struct amd_uncore * __percpu *uncores)
+{
+       unsigned int cpu;
+       struct amd_uncore *that;
+
+       for_each_online_cpu(cpu) {
+               that = *per_cpu_ptr(uncores, cpu);
+
+               if (!that)
+                       continue;
+
+               if (this == that)
+                       continue;
+
+               if (this->id == that->id) {
+                       that->free_when_cpu_online = this;
+                       this = that;
+                       break;
+               }
+       }
+
+       this->refcnt++;
+       return this;
+}
+
+static void amd_uncore_cpu_starting(unsigned int cpu)
+{
+       unsigned int eax, ebx, ecx, edx;
+       struct amd_uncore *uncore;
+
+       if (amd_uncore_nb) {
+               uncore = *per_cpu_ptr(amd_uncore_nb, cpu);
+               cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
+               uncore->id = ecx & 0xff;
+
+               uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb);
+               *per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
+       }
+
+       if (amd_uncore_l2) {
+               unsigned int apicid = cpu_data(cpu).apicid;
+               unsigned int nshared;
+
+               uncore = *per_cpu_ptr(amd_uncore_l2, cpu);
+               cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
+               nshared = ((eax >> 14) & 0xfff) + 1;
+               uncore->id = apicid - (apicid % nshared);
+
+               uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2);
+               *per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
+       }
+}
+
+static void uncore_online(unsigned int cpu,
+                         struct amd_uncore * __percpu *uncores)
+{
+       struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+
+       kfree(uncore->free_when_cpu_online);
+       uncore->free_when_cpu_online = NULL;
+
+       if (cpu == uncore->cpu)
+               cpumask_set_cpu(cpu, uncore->active_mask);
+}
+
+static void amd_uncore_cpu_online(unsigned int cpu)
+{
+       if (amd_uncore_nb)
+               uncore_online(cpu, amd_uncore_nb);
+
+       if (amd_uncore_l2)
+               uncore_online(cpu, amd_uncore_l2);
+}
+
+static void uncore_down_prepare(unsigned int cpu,
+                               struct amd_uncore * __percpu *uncores)
+{
+       unsigned int i;
+       struct amd_uncore *this = *per_cpu_ptr(uncores, cpu);
+
+       if (this->cpu != cpu)
+               return;
+
+       /* this cpu is going down, migrate to a shared sibling if possible */
+       for_each_online_cpu(i) {
+               struct amd_uncore *that = *per_cpu_ptr(uncores, i);
+
+               if (cpu == i)
+                       continue;
+
+               if (this == that) {
+                       perf_pmu_migrate_context(this->pmu, cpu, i);
+                       cpumask_clear_cpu(cpu, that->active_mask);
+                       cpumask_set_cpu(i, that->active_mask);
+                       that->cpu = i;
+                       break;
+               }
+       }
+}
+
+static void amd_uncore_cpu_down_prepare(unsigned int cpu)
+{
+       if (amd_uncore_nb)
+               uncore_down_prepare(cpu, amd_uncore_nb);
+
+       if (amd_uncore_l2)
+               uncore_down_prepare(cpu, amd_uncore_l2);
+}
+
+static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
+{
+       struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+
+       if (cpu == uncore->cpu)
+               cpumask_clear_cpu(cpu, uncore->active_mask);
+
+       if (!--uncore->refcnt)
+               kfree(uncore);
+       *per_cpu_ptr(uncores, cpu) = NULL;
+}
+
+static void amd_uncore_cpu_dead(unsigned int cpu)
+{
+       if (amd_uncore_nb)
+               uncore_dead(cpu, amd_uncore_nb);
+
+       if (amd_uncore_l2)
+               uncore_dead(cpu, amd_uncore_l2);
+}
+
+static int
+amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action,
+                       void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               if (amd_uncore_cpu_up_prepare(cpu))
+                       return notifier_from_errno(-ENOMEM);
+               break;
+
+       case CPU_STARTING:
+               amd_uncore_cpu_starting(cpu);
+               break;
+
+       case CPU_ONLINE:
+               amd_uncore_cpu_online(cpu);
+               break;
+
+       case CPU_DOWN_PREPARE:
+               amd_uncore_cpu_down_prepare(cpu);
+               break;
+
+       case CPU_UP_CANCELED:
+       case CPU_DEAD:
+               amd_uncore_cpu_dead(cpu);
+               break;
+
+       default:
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block amd_uncore_cpu_notifier_block = {
+       .notifier_call  = amd_uncore_cpu_notifier,
+       .priority       = CPU_PRI_PERF + 1,
+};
+
+static void __init init_cpu_already_online(void *dummy)
+{
+       unsigned int cpu = smp_processor_id();
+
+       amd_uncore_cpu_starting(cpu);
+       amd_uncore_cpu_online(cpu);
+}
+
+static void cleanup_cpu_online(void *dummy)
+{
+       unsigned int cpu = smp_processor_id();
+
+       amd_uncore_cpu_dead(cpu);
+}
+
+static int __init amd_uncore_init(void)
+{
+       unsigned int cpu, cpu2;
+       int ret = -ENODEV;
+
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+               goto fail_nodev;
+
+       if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
+               goto fail_nodev;
+
+       if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
+               amd_uncore_nb = alloc_percpu(struct amd_uncore *);
+               if (!amd_uncore_nb) {
+                       ret = -ENOMEM;
+                       goto fail_nb;
+               }
+               ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
+               if (ret)
+                       goto fail_nb;
+
+               pr_info("perf: AMD NB counters detected\n");
+               ret = 0;
+       }
+
+       if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) {
+               amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
+               if (!amd_uncore_l2) {
+                       ret = -ENOMEM;
+                       goto fail_l2;
+               }
+               ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
+               if (ret)
+                       goto fail_l2;
+
+               pr_info("perf: AMD L2I counters detected\n");
+               ret = 0;
+       }
+
+       if (ret)
+               goto fail_nodev;
+
+       cpu_notifier_register_begin();
+
+       /* init cpus already online before registering for hotplug notifier */
+       for_each_online_cpu(cpu) {
+               ret = amd_uncore_cpu_up_prepare(cpu);
+               if (ret)
+                       goto fail_online;
+               smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
+       }
+
+       __register_cpu_notifier(&amd_uncore_cpu_notifier_block);
+       cpu_notifier_register_done();
+
+       return 0;
+
+
+fail_online:
+       for_each_online_cpu(cpu2) {
+               if (cpu2 == cpu)
+                       break;
+               smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1);
+       }
+       cpu_notifier_register_done();
+
+       /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */
+       amd_uncore_nb = amd_uncore_l2 = NULL;
+
+       if (boot_cpu_has(X86_FEATURE_PERFCTR_L2))
+               perf_pmu_unregister(&amd_l2_pmu);
+fail_l2:
+       if (boot_cpu_has(X86_FEATURE_PERFCTR_NB))
+               perf_pmu_unregister(&amd_nb_pmu);
+       if (amd_uncore_l2)
+               free_percpu(amd_uncore_l2);
+fail_nb:
+       if (amd_uncore_nb)
+               free_percpu(amd_uncore_nb);
+
+fail_nodev:
+       return ret;
+}
+device_initcall(amd_uncore_init);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c

new file mode 100644 (file)

index 0000000..5e830d0
--- /dev/null
+++ b/arch/x86/events/core.c
@@ -0,0 +1,2442 @@
+/*
+ * Performance events x86 architecture code
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
+ *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ *  Copyright (C) 2009 Google, Inc., Stephane Eranian
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/bitops.h>
+#include <linux/device.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
+#include <asm/alternative.h>
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+#include <asm/timer.h>
+#include <asm/desc.h>
+#include <asm/ldt.h>
+
+#include "perf_event.h"
+
+struct x86_pmu x86_pmu __read_mostly;
+
+DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
+       .enabled = 1,
+};
+
+struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;
+
+u64 __read_mostly hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
+u64 __read_mostly hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+/*
+ * Propagate event elapsed time into the generic event.
+ * Can only be executed on the CPU where the event is active.
+ * Returns the delta events processed.
+ */
+u64 x86_perf_event_update(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       int shift = 64 - x86_pmu.cntval_bits;
+       u64 prev_raw_count, new_raw_count;
+       int idx = hwc->idx;
+       s64 delta;
+
+       if (idx == INTEL_PMC_IDX_FIXED_BTS)
+               return 0;
+
+       /*
+        * Careful: an NMI might modify the previous event value.
+        *
+        * Our tactic to handle this is to first atomically read and
+        * exchange a new raw count - then add that new-prev delta
+        * count to the generic event atomically:
+        */
+again:
+       prev_raw_count = local64_read(&hwc->prev_count);
+       rdpmcl(hwc->event_base_rdpmc, new_raw_count);
+
+       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                                       new_raw_count) != prev_raw_count)
+               goto again;
+
+       /*
+        * Now we have the new raw value and have updated the prev
+        * timestamp already. We can now calculate the elapsed delta
+        * (event-)time and add that to the generic event.
+        *
+        * Careful, not all hw sign-extends above the physical width
+        * of the count.
+        */
+       delta = (new_raw_count << shift) - (prev_raw_count << shift);
+       delta >>= shift;
+
+       local64_add(delta, &event->count);
+       local64_sub(delta, &hwc->period_left);
+
+       return new_raw_count;
+}
+
+/*
+ * Find and validate any extra registers to set up.
+ */
+static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg;
+       struct extra_reg *er;
+
+       reg = &event->hw.extra_reg;
+
+       if (!x86_pmu.extra_regs)
+               return 0;
+
+       for (er = x86_pmu.extra_regs; er->msr; er++) {
+               if (er->event != (config & er->config_mask))
+                       continue;
+               if (event->attr.config1 & ~er->valid_mask)
+                       return -EINVAL;
+               /* Check if the extra msrs can be safely accessed*/
+               if (!er->extra_msr_access)
+                       return -ENXIO;
+
+               reg->idx = er->idx;
+               reg->config = event->attr.config1;
+               reg->reg = er->msr;
+               break;
+       }
+       return 0;
+}
+
+static atomic_t active_events;
+static atomic_t pmc_refcount;
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+static bool reserve_pmc_hardware(void)
+{
+       int i;
+
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
+                       goto perfctr_fail;
+       }
+
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
+                       goto eventsel_fail;
+       }
+
+       return true;
+
+eventsel_fail:
+       for (i--; i >= 0; i--)
+               release_evntsel_nmi(x86_pmu_config_addr(i));
+
+       i = x86_pmu.num_counters;
+
+perfctr_fail:
+       for (i--; i >= 0; i--)
+               release_perfctr_nmi(x86_pmu_event_addr(i));
+
+       return false;
+}
+
+static void release_pmc_hardware(void)
+{
+       int i;
+
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               release_perfctr_nmi(x86_pmu_event_addr(i));
+               release_evntsel_nmi(x86_pmu_config_addr(i));
+       }
+}
+
+#else
+
+static bool reserve_pmc_hardware(void) { return true; }
+static void release_pmc_hardware(void) {}
+
+#endif
+
+static bool check_hw_exists(void)
+{
+       u64 val, val_fail, val_new= ~0;
+       int i, reg, reg_fail, ret = 0;
+       int bios_fail = 0;
+       int reg_safe = -1;
+
+       /*
+        * Check to see if the BIOS enabled any of the counters, if so
+        * complain and bail.
+        */
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               reg = x86_pmu_config_addr(i);
+               ret = rdmsrl_safe(reg, &val);
+               if (ret)
+                       goto msr_fail;
+               if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
+                       bios_fail = 1;
+                       val_fail = val;
+                       reg_fail = reg;
+               } else {
+                       reg_safe = i;
+               }
+       }
+
+       if (x86_pmu.num_counters_fixed) {
+               reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+               ret = rdmsrl_safe(reg, &val);
+               if (ret)
+                       goto msr_fail;
+               for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
+                       if (val & (0x03 << i*4)) {
+                               bios_fail = 1;
+                               val_fail = val;
+                               reg_fail = reg;
+                       }
+               }
+       }
+
+       /*
+        * If all the counters are enabled, the below test will always
+        * fail.  The tools will also become useless in this scenario.
+        * Just fail and disable the hardware counters.
+        */
+
+       if (reg_safe == -1) {
+               reg = reg_safe;
+               goto msr_fail;
+       }
+
+       /*
+        * Read the current value, change it and read it back to see if it
+        * matches, this is needed to detect certain hardware emulators
+        * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+        */
+       reg = x86_pmu_event_addr(reg_safe);
+       if (rdmsrl_safe(reg, &val))
+               goto msr_fail;
+       val ^= 0xffffUL;
+       ret = wrmsrl_safe(reg, val);
+       ret |= rdmsrl_safe(reg, &val_new);
+       if (ret || val != val_new)
+               goto msr_fail;
+
+       /*
+        * We still allow the PMU driver to operate:
+        */
+       if (bios_fail) {
+               pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
+               pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
+                             reg_fail, val_fail);
+       }
+
+       return true;
+
+msr_fail:
+       pr_cont("Broken PMU hardware detected, using software events only.\n");
+       pr_info("%sFailed to access perfctr msr (MSR %x is %Lx)\n",
+               boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR,
+               reg, val_new);
+
+       return false;
+}
+
+static void hw_perf_event_destroy(struct perf_event *event)
+{
+       x86_release_hardware();
+       atomic_dec(&active_events);
+}
+
+void hw_perf_lbr_event_destroy(struct perf_event *event)
+{
+       hw_perf_event_destroy(event);
+
+       /* undo the lbr/bts event accounting */
+       x86_del_exclusive(x86_lbr_exclusive_lbr);
+}
+
+static inline int x86_pmu_initialized(void)
+{
+       return x86_pmu.handle_irq != NULL;
+}
+
+static inline int
+set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
+{
+       struct perf_event_attr *attr = &event->attr;
+       unsigned int cache_type, cache_op, cache_result;
+       u64 config, val;
+
+       config = attr->config;
+
+       cache_type = (config >>  0) & 0xff;
+       if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+               return -EINVAL;
+
+       cache_op = (config >>  8) & 0xff;
+       if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+               return -EINVAL;
+
+       cache_result = (config >> 16) & 0xff;
+       if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+               return -EINVAL;
+
+       val = hw_cache_event_ids[cache_type][cache_op][cache_result];
+
+       if (val == 0)
+               return -ENOENT;
+
+       if (val == -1)
+               return -EINVAL;
+
+       hwc->config |= val;
+       attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
+       return x86_pmu_extra_regs(val, event);
+}
+
+int x86_reserve_hardware(void)
+{
+       int err = 0;
+
+       if (!atomic_inc_not_zero(&pmc_refcount)) {
+               mutex_lock(&pmc_reserve_mutex);
+               if (atomic_read(&pmc_refcount) == 0) {
+                       if (!reserve_pmc_hardware())
+                               err = -EBUSY;
+                       else
+                               reserve_ds_buffers();
+               }
+               if (!err)
+                       atomic_inc(&pmc_refcount);
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+
+       return err;
+}
+
+void x86_release_hardware(void)
+{
+       if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
+               release_pmc_hardware();
+               release_ds_buffers();
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+}
+
+/*
+ * Check if we can create event of a certain type (that no conflicting events
+ * are present).
+ */
+int x86_add_exclusive(unsigned int what)
+{
+       int i;
+
+       if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
+               mutex_lock(&pmc_reserve_mutex);
+               for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
+                       if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
+                               goto fail_unlock;
+               }
+               atomic_inc(&x86_pmu.lbr_exclusive[what]);
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+
+       atomic_inc(&active_events);
+       return 0;
+
+fail_unlock:
+       mutex_unlock(&pmc_reserve_mutex);
+       return -EBUSY;
+}
+
+void x86_del_exclusive(unsigned int what)
+{
+       atomic_dec(&x86_pmu.lbr_exclusive[what]);
+       atomic_dec(&active_events);
+}
+
+int x86_setup_perfctr(struct perf_event *event)
+{
+       struct perf_event_attr *attr = &event->attr;
+       struct hw_perf_event *hwc = &event->hw;
+       u64 config;
+
+       if (!is_sampling_event(event)) {
+               hwc->sample_period = x86_pmu.max_period;
+               hwc->last_period = hwc->sample_period;
+               local64_set(&hwc->period_left, hwc->sample_period);
+       }
+
+       if (attr->type == PERF_TYPE_RAW)
+               return x86_pmu_extra_regs(event->attr.config, event);
+
+       if (attr->type == PERF_TYPE_HW_CACHE)
+               return set_ext_hw_attr(hwc, event);
+
+       if (attr->config >= x86_pmu.max_events)
+               return -EINVAL;
+
+       /*
+        * The generic map:
+        */
+       config = x86_pmu.event_map(attr->config);
+
+       if (config == 0)
+               return -ENOENT;
+
+       if (config == -1LL)
+               return -EINVAL;
+
+       /*
+        * Branch tracing:
+        */
+       if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
+           !attr->freq && hwc->sample_period == 1) {
+               /* BTS is not supported by this architecture. */
+               if (!x86_pmu.bts_active)
+                       return -EOPNOTSUPP;
+
+               /* BTS is currently only allowed for user-mode. */
+               if (!attr->exclude_kernel)
+                       return -EOPNOTSUPP;
+
+               /* disallow bts if conflicting events are present */
+               if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+                       return -EBUSY;
+
+               event->destroy = hw_perf_lbr_event_destroy;
+       }
+
+       hwc->config |= config;
+
+       return 0;
+}
+
+/*
+ * check that branch_sample_type is compatible with
+ * settings needed for precise_ip > 1 which implies
+ * using the LBR to capture ALL taken branches at the
+ * priv levels of the measurement
+ */
+static inline int precise_br_compat(struct perf_event *event)
+{
+       u64 m = event->attr.branch_sample_type;
+       u64 b = 0;
+
+       /* must capture all branches */
+       if (!(m & PERF_SAMPLE_BRANCH_ANY))
+               return 0;
+
+       m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
+
+       if (!event->attr.exclude_user)
+               b |= PERF_SAMPLE_BRANCH_USER;
+
+       if (!event->attr.exclude_kernel)
+               b |= PERF_SAMPLE_BRANCH_KERNEL;
+
+       /*
+        * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
+        */
+
+       return m == b;
+}
+
+int x86_pmu_hw_config(struct perf_event *event)
+{
+       if (event->attr.precise_ip) {
+               int precise = 0;
+
+               /* Support for constant skid */
+               if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
+                       precise++;
+
+                       /* Support for IP fixup */
+                       if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
+                               precise++;
+
+                       if (x86_pmu.pebs_prec_dist)
+                               precise++;
+               }
+
+               if (event->attr.precise_ip > precise)
+                       return -EOPNOTSUPP;
+       }
+       /*
+        * check that PEBS LBR correction does not conflict with
+        * whatever the user is asking with attr->branch_sample_type
+        */
+       if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
+               u64 *br_type = &event->attr.branch_sample_type;
+
+               if (has_branch_stack(event)) {
+                       if (!precise_br_compat(event))
+                               return -EOPNOTSUPP;
+
+                       /* branch_sample_type is compatible */
+
+               } else {
+                       /*
+                        * user did not specify  branch_sample_type
+                        *
+                        * For PEBS fixups, we capture all
+                        * the branches at the priv level of the
+                        * event.
+                        */
+                       *br_type = PERF_SAMPLE_BRANCH_ANY;
+
+                       if (!event->attr.exclude_user)
+                               *br_type |= PERF_SAMPLE_BRANCH_USER;
+
+                       if (!event->attr.exclude_kernel)
+                               *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
+               }
+       }
+
+       if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
+               event->attach_state |= PERF_ATTACH_TASK_DATA;
+
+       /*
+        * Generate PMC IRQs:
+        * (keep 'enabled' bit clear for now)
+        */
+       event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
+
+       /*
+        * Count user and OS events unless requested not to
+        */
+       if (!event->attr.exclude_user)
+               event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
+       if (!event->attr.exclude_kernel)
+               event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
+
+       if (event->attr.type == PERF_TYPE_RAW)
+               event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
+
+       if (event->attr.sample_period && x86_pmu.limit_period) {
+               if (x86_pmu.limit_period(event, event->attr.sample_period) >
+                               event->attr.sample_period)
+                       return -EINVAL;
+       }
+
+       return x86_setup_perfctr(event);
+}
+
+/*
+ * Setup the hardware configuration for a given attr_type
+ */
+static int __x86_pmu_event_init(struct perf_event *event)
+{
+       int err;
+
+       if (!x86_pmu_initialized())
+               return -ENODEV;
+
+       err = x86_reserve_hardware();
+       if (err)
+               return err;
+
+       atomic_inc(&active_events);
+       event->destroy = hw_perf_event_destroy;
+
+       event->hw.idx = -1;
+       event->hw.last_cpu = -1;
+       event->hw.last_tag = ~0ULL;
+
+       /* mark unused */
+       event->hw.extra_reg.idx = EXTRA_REG_NONE;
+       event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
+       return x86_pmu.hw_config(event);
+}
+
+void x86_pmu_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               u64 val;
+
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+               rdmsrl(x86_pmu_config_addr(idx), val);
+               if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
+                       continue;
+               val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+               wrmsrl(x86_pmu_config_addr(idx), val);
+       }
+}
+
+/*
+ * There may be PMI landing after enabled=0. The PMI hitting could be before or
+ * after disable_all.
+ *
+ * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
+ * It will not be re-enabled in the NMI handler again, because enabled=0. After
+ * handling the NMI, disable_all will be called, which will not change the
+ * state either. If PMI hits after disable_all, the PMU is already disabled
+ * before entering NMI handler. The NMI handler will not change the state
+ * either.
+ *
+ * So either situation is harmless.
+ */
+static void x86_pmu_disable(struct pmu *pmu)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (!x86_pmu_initialized())
+               return;
+
+       if (!cpuc->enabled)
+               return;
+
+       cpuc->n_added = 0;
+       cpuc->enabled = 0;
+       barrier();
+
+       x86_pmu.disable_all();
+}
+
+void x86_pmu_enable_all(int added)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
+
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+
+               __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+       }
+}
+
+static struct pmu pmu;
+
+static inline int is_x86_event(struct perf_event *event)
+{
+       return event->pmu == &pmu;
+}
+
+/*
+ * Event scheduler state:
+ *
+ * Assign events iterating over all events and counters, beginning
+ * with events with least weights first. Keep the current iterator
+ * state in struct sched_state.
+ */
+struct sched_state {
+       int     weight;
+       int     event;          /* event index */
+       int     counter;        /* counter index */
+       int     unassigned;     /* number of events to be assigned left */
+       int     nr_gp;          /* number of GP counters used */
+       unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+};
+
+/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
+#define        SCHED_STATES_MAX        2
+
+struct perf_sched {
+       int                     max_weight;
+       int                     max_events;
+       int                     max_gp;
+       int                     saved_states;
+       struct event_constraint **constraints;
+       struct sched_state      state;
+       struct sched_state      saved[SCHED_STATES_MAX];
+};
+
+/*
+ * Initialize interator that runs through all events and counters.
+ */
+static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
+                           int num, int wmin, int wmax, int gpmax)
+{
+       int idx;
+
+       memset(sched, 0, sizeof(*sched));
+       sched->max_events       = num;
+       sched->max_weight       = wmax;
+       sched->max_gp           = gpmax;
+       sched->constraints      = constraints;
+
+       for (idx = 0; idx < num; idx++) {
+               if (constraints[idx]->weight == wmin)
+                       break;
+       }
+
+       sched->state.event      = idx;          /* start with min weight */
+       sched->state.weight     = wmin;
+       sched->state.unassigned = num;
+}
+
+static void perf_sched_save_state(struct perf_sched *sched)
+{
+       if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
+               return;
+
+       sched->saved[sched->saved_states] = sched->state;
+       sched->saved_states++;
+}
+
+static bool perf_sched_restore_state(struct perf_sched *sched)
+{
+       if (!sched->saved_states)
+               return false;
+
+       sched->saved_states--;
+       sched->state = sched->saved[sched->saved_states];
+
+       /* continue with next counter: */
+       clear_bit(sched->state.counter++, sched->state.used);
+
+       return true;
+}
+
+/*
+ * Select a counter for the current event to schedule. Return true on
+ * success.
+ */
+static bool __perf_sched_find_counter(struct perf_sched *sched)
+{
+       struct event_constraint *c;
+       int idx;
+
+       if (!sched->state.unassigned)
+               return false;
+
+       if (sched->state.event >= sched->max_events)
+               return false;
+
+       c = sched->constraints[sched->state.event];
+       /* Prefer fixed purpose counters */
+       if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
+               idx = INTEL_PMC_IDX_FIXED;
+               for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
+                       if (!__test_and_set_bit(idx, sched->state.used))
+                               goto done;
+               }
+       }
+
+       /* Grab the first unused counter starting with idx */
+       idx = sched->state.counter;
+       for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
+               if (!__test_and_set_bit(idx, sched->state.used)) {
+                       if (sched->state.nr_gp++ >= sched->max_gp)
+                               return false;
+
+                       goto done;
+               }
+       }
+
+       return false;
+
+done:
+       sched->state.counter = idx;
+
+       if (c->overlap)
+               perf_sched_save_state(sched);
+
+       return true;
+}
+
+static bool perf_sched_find_counter(struct perf_sched *sched)
+{
+       while (!__perf_sched_find_counter(sched)) {
+               if (!perf_sched_restore_state(sched))
+                       return false;
+       }
+
+       return true;
+}
+
+/*
+ * Go through all unassigned events and find the next one to schedule.
+ * Take events with the least weight first. Return true on success.
+ */
+static bool perf_sched_next_event(struct perf_sched *sched)
+{
+       struct event_constraint *c;
+
+       if (!sched->state.unassigned || !--sched->state.unassigned)
+               return false;
+
+       do {
+               /* next event */
+               sched->state.event++;
+               if (sched->state.event >= sched->max_events) {
+                       /* next weight */
+                       sched->state.event = 0;
+                       sched->state.weight++;
+                       if (sched->state.weight > sched->max_weight)
+                               return false;
+               }
+               c = sched->constraints[sched->state.event];
+       } while (c->weight != sched->state.weight);
+
+       sched->state.counter = 0;       /* start with first counter */
+
+       return true;
+}
+
+/*
+ * Assign a counter for each event.
+ */
+int perf_assign_events(struct event_constraint **constraints, int n,
+                       int wmin, int wmax, int gpmax, int *assign)
+{
+       struct perf_sched sched;
+
+       perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
+
+       do {
+               if (!perf_sched_find_counter(&sched))
+                       break;  /* failed */
+               if (assign)
+                       assign[sched.state.event] = sched.state.counter;
+       } while (perf_sched_next_event(&sched));
+
+       return sched.state.unassigned;
+}
+EXPORT_SYMBOL_GPL(perf_assign_events);
+
+int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+       struct event_constraint *c;
+       unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       struct perf_event *e;
+       int i, wmin, wmax, unsched = 0;
+       struct hw_perf_event *hwc;
+
+       bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+
+       if (x86_pmu.start_scheduling)
+               x86_pmu.start_scheduling(cpuc);
+
+       for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
+               cpuc->event_constraint[i] = NULL;
+               c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
+               cpuc->event_constraint[i] = c;
+
+               wmin = min(wmin, c->weight);
+               wmax = max(wmax, c->weight);
+       }
+
+       /*
+        * fastpath, try to reuse previous register
+        */
+       for (i = 0; i < n; i++) {
+               hwc = &cpuc->event_list[i]->hw;
+               c = cpuc->event_constraint[i];
+
+               /* never assigned */
+               if (hwc->idx == -1)
+                       break;
+
+               /* constraint still honored */
+               if (!test_bit(hwc->idx, c->idxmsk))
+                       break;
+
+               /* not already used */
+               if (test_bit(hwc->idx, used_mask))
+                       break;
+
+               __set_bit(hwc->idx, used_mask);
+               if (assign)
+                       assign[i] = hwc->idx;
+       }
+
+       /* slow path */
+       if (i != n) {
+               int gpmax = x86_pmu.num_counters;
+
+               /*
+                * Do not allow scheduling of more than half the available
+                * generic counters.
+                *
+                * This helps avoid counter starvation of sibling thread by
+                * ensuring at most half the counters cannot be in exclusive
+                * mode. There is no designated counters for the limits. Any
+                * N/2 counters can be used. This helps with events with
+                * specific counter constraints.
+                */
+               if (is_ht_workaround_enabled() && !cpuc->is_fake &&
+                   READ_ONCE(cpuc->excl_cntrs->exclusive_present))
+                       gpmax /= 2;
+
+               unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
+                                            wmax, gpmax, assign);
+       }
+
+       /*
+        * In case of success (unsched = 0), mark events as committed,
+        * so we do not put_constraint() in case new events are added
+        * and fail to be scheduled
+        *
+        * We invoke the lower level commit callback to lock the resource
+        *
+        * We do not need to do all of this in case we are called to
+        * validate an event group (assign == NULL)
+        */
+       if (!unsched && assign) {
+               for (i = 0; i < n; i++) {
+                       e = cpuc->event_list[i];
+                       e->hw.flags |= PERF_X86_EVENT_COMMITTED;
+                       if (x86_pmu.commit_scheduling)
+                               x86_pmu.commit_scheduling(cpuc, i, assign[i]);
+               }
+       } else {
+               for (i = 0; i < n; i++) {
+                       e = cpuc->event_list[i];
+                       /*
+                        * do not put_constraint() on comitted events,
+                        * because they are good to go
+                        */
+                       if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
+                               continue;
+
+                       /*
+                        * release events that failed scheduling
+                        */
+                       if (x86_pmu.put_event_constraints)
+                               x86_pmu.put_event_constraints(cpuc, e);
+               }
+       }
+
+       if (x86_pmu.stop_scheduling)
+               x86_pmu.stop_scheduling(cpuc);
+
+       return unsched ? -EINVAL : 0;
+}
+
+/*
+ * dogrp: true if must collect siblings events (group)
+ * returns total number of events and error code
+ */
+static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
+{
+       struct perf_event *event;
+       int n, max_count;
+
+       max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
+
+       /* current number of events already accepted */
+       n = cpuc->n_events;
+
+       if (is_x86_event(leader)) {
+               if (n >= max_count)
+                       return -EINVAL;
+               cpuc->event_list[n] = leader;
+               n++;
+       }
+       if (!dogrp)
+               return n;
+
+       list_for_each_entry(event, &leader->sibling_list, group_entry) {
+               if (!is_x86_event(event) ||
+                   event->state <= PERF_EVENT_STATE_OFF)
+                       continue;
+
+               if (n >= max_count)
+                       return -EINVAL;
+
+               cpuc->event_list[n] = event;
+               n++;
+       }
+       return n;
+}
+
+static inline void x86_assign_hw_event(struct perf_event *event,
+                               struct cpu_hw_events *cpuc, int i)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       hwc->idx = cpuc->assign[i];
+       hwc->last_cpu = smp_processor_id();
+       hwc->last_tag = ++cpuc->tags[i];
+
+       if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
+               hwc->config_base = 0;
+               hwc->event_base = 0;
+       } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
+               hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+               hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
+               hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
+       } else {
+               hwc->config_base = x86_pmu_config_addr(hwc->idx);
+               hwc->event_base  = x86_pmu_event_addr(hwc->idx);
+               hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
+       }
+}
+
+static inline int match_prev_assignment(struct hw_perf_event *hwc,
+                                       struct cpu_hw_events *cpuc,
+                                       int i)
+{
+       return hwc->idx == cpuc->assign[i] &&
+               hwc->last_cpu == smp_processor_id() &&
+               hwc->last_tag == cpuc->tags[i];
+}
+
+static void x86_pmu_start(struct perf_event *event, int flags);
+
+static void x86_pmu_enable(struct pmu *pmu)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct perf_event *event;
+       struct hw_perf_event *hwc;
+       int i, added = cpuc->n_added;
+
+       if (!x86_pmu_initialized())
+               return;
+
+       if (cpuc->enabled)
+               return;
+
+       if (cpuc->n_added) {
+               int n_running = cpuc->n_events - cpuc->n_added;
+               /*
+                * apply assignment obtained either from
+                * hw_perf_group_sched_in() or x86_pmu_enable()
+                *
+                * step1: save events moving to new counters
+                */
+               for (i = 0; i < n_running; i++) {
+                       event = cpuc->event_list[i];
+                       hwc = &event->hw;
+
+                       /*
+                        * we can avoid reprogramming counter if:
+                        * - assigned same counter as last time
+                        * - running on same CPU as last time
+                        * - no other event has used the counter since
+                        */
+                       if (hwc->idx == -1 ||
+                           match_prev_assignment(hwc, cpuc, i))
+                               continue;
+
+                       /*
+                        * Ensure we don't accidentally enable a stopped
+                        * counter simply because we rescheduled.
+                        */
+                       if (hwc->state & PERF_HES_STOPPED)
+                               hwc->state |= PERF_HES_ARCH;
+
+                       x86_pmu_stop(event, PERF_EF_UPDATE);
+               }
+
+               /*
+                * step2: reprogram moved events into new counters
+                */
+               for (i = 0; i < cpuc->n_events; i++) {
+                       event = cpuc->event_list[i];
+                       hwc = &event->hw;
+
+                       if (!match_prev_assignment(hwc, cpuc, i))
+                               x86_assign_hw_event(event, cpuc, i);
+                       else if (i < n_running)
+                               continue;
+
+                       if (hwc->state & PERF_HES_ARCH)
+                               continue;
+
+                       x86_pmu_start(event, PERF_EF_RELOAD);
+               }
+               cpuc->n_added = 0;
+               perf_events_lapic_init();
+       }
+
+       cpuc->enabled = 1;
+       barrier();
+
+       x86_pmu.enable_all(added);
+}
+
+static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
+
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the event disabled in hw:
+ */
+int x86_perf_event_set_period(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       s64 left = local64_read(&hwc->period_left);
+       s64 period = hwc->sample_period;
+       int ret = 0, idx = hwc->idx;
+
+       if (idx == INTEL_PMC_IDX_FIXED_BTS)
+               return 0;
+
+       /*
+        * If we are way outside a reasonable range then just skip forward:
+        */
+       if (unlikely(left <= -period)) {
+               left = period;
+               local64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               ret = 1;
+       }
+
+       if (unlikely(left <= 0)) {
+               left += period;
+               local64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               ret = 1;
+       }
+       /*
+        * Quirk: certain CPUs dont like it if just 1 hw_event is left:
+        */
+       if (unlikely(left < 2))
+               left = 2;
+
+       if (left > x86_pmu.max_period)
+               left = x86_pmu.max_period;
+
+       if (x86_pmu.limit_period)
+               left = x86_pmu.limit_period(event, left);
+
+       per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
+
+       if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) ||
+           local64_read(&hwc->prev_count) != (u64)-left) {
+               /*
+                * The hw event starts counting from this event offset,
+                * mark it to be able to extra future deltas:
+                */
+               local64_set(&hwc->prev_count, (u64)-left);
+
+               wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+       }
+
+       /*
+        * Due to erratum on certan cpu we need
+        * a second write to be sure the register
+        * is updated properly
+        */
+       if (x86_pmu.perfctr_second_write) {
+               wrmsrl(hwc->event_base,
+                       (u64)(-left) & x86_pmu.cntval_mask);
+       }
+
+       perf_event_update_userpage(event);
+
+       return ret;
+}
+
+void x86_pmu_enable_event(struct perf_event *event)
+{
+       if (__this_cpu_read(cpu_hw_events.enabled))
+               __x86_pmu_enable_event(&event->hw,
+                                      ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+/*
+ * Add a single event to the PMU.
+ *
+ * The event is added to the group of enabled events
+ * but only if it can be scehduled with existing events.
+ */
+static int x86_pmu_add(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc;
+       int assign[X86_PMC_IDX_MAX];
+       int n, n0, ret;
+
+       hwc = &event->hw;
+
+       n0 = cpuc->n_events;
+       ret = n = collect_events(cpuc, event, false);
+       if (ret < 0)
+               goto out;
+
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+       if (!(flags & PERF_EF_START))
+               hwc->state |= PERF_HES_ARCH;
+
+       /*
+        * If group events scheduling transaction was started,
+        * skip the schedulability test here, it will be performed
+        * at commit time (->commit_txn) as a whole.
+        */
+       if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
+               goto done_collect;
+
+       ret = x86_pmu.schedule_events(cpuc, n, assign);
+       if (ret)
+               goto out;
+       /*
+        * copy new assignment, now we know it is possible
+        * will be used by hw_perf_enable()
+        */
+       memcpy(cpuc->assign, assign, n*sizeof(int));
+
+done_collect:
+       /*
+        * Commit the collect_events() state. See x86_pmu_del() and
+        * x86_pmu_*_txn().
+        */
+       cpuc->n_events = n;
+       cpuc->n_added += n - n0;
+       cpuc->n_txn += n - n0;
+
+       ret = 0;
+out:
+       return ret;
+}
+
+static void x86_pmu_start(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx = event->hw.idx;
+
+       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+               return;
+
+       if (WARN_ON_ONCE(idx == -1))
+               return;
+
+       if (flags & PERF_EF_RELOAD) {
+               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+               x86_perf_event_set_period(event);
+       }
+
+       event->hw.state = 0;
+
+       cpuc->events[idx] = event;
+       __set_bit(idx, cpuc->active_mask);
+       __set_bit(idx, cpuc->running);
+       x86_pmu.enable(event);
+       perf_event_update_userpage(event);
+}
+
+void perf_event_print_debug(void)
+{
+       u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+       u64 pebs, debugctl;
+       struct cpu_hw_events *cpuc;
+       unsigned long flags;
+       int cpu, idx;
+
+       if (!x86_pmu.num_counters)
+               return;
+
+       local_irq_save(flags);
+
+       cpu = smp_processor_id();
+       cpuc = &per_cpu(cpu_hw_events, cpu);
+
+       if (x86_pmu.version >= 2) {
+               rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+               rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+               rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+
+               pr_info("\n");
+               pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
+               pr_info("CPU#%d: status:     %016llx\n", cpu, status);
+               pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
+               pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
+               if (x86_pmu.pebs_constraints) {
+                       rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
+                       pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
+               }
+               if (x86_pmu.lbr_nr) {
+                       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+                       pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
+               }
+       }
+       pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
+               rdmsrl(x86_pmu_event_addr(idx), pmc_count);
+
+               prev_left = per_cpu(pmc_prev_left[idx], cpu);
+
+               pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
+                       cpu, idx, pmc_ctrl);
+               pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
+                       cpu, idx, pmc_count);
+               pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
+                       cpu, idx, prev_left);
+       }
+       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+
+               pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
+                       cpu, idx, pmc_count);
+       }
+       local_irq_restore(flags);
+}
+
+void x86_pmu_stop(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
+               x86_pmu.disable(event);
+               cpuc->events[hwc->idx] = NULL;
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+       }
+
+       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               /*
+                * Drain the remaining delta count out of a event
+                * that we are disabling:
+                */
+               x86_perf_event_update(event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+}
+
+static void x86_pmu_del(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int i;
+
+       /*
+        * event is descheduled
+        */
+       event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
+
+       /*
+        * If we're called during a txn, we don't need to do anything.
+        * The events never got scheduled and ->cancel_txn will truncate
+        * the event_list.
+        *
+        * XXX assumes any ->del() called during a TXN will only be on
+        * an event added during that same TXN.
+        */
+       if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
+               return;
+
+       /*
+        * Not a TXN, therefore cleanup properly.
+        */
+       x86_pmu_stop(event, PERF_EF_UPDATE);
+
+       for (i = 0; i < cpuc->n_events; i++) {
+               if (event == cpuc->event_list[i])
+                       break;
+       }
+
+       if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
+               return;
+
+       /* If we have a newly added event; make sure to decrease n_added. */
+       if (i >= cpuc->n_events - cpuc->n_added)
+               --cpuc->n_added;
+
+       if (x86_pmu.put_event_constraints)
+               x86_pmu.put_event_constraints(cpuc, event);
+
+       /* Delete the array entry. */
+       while (++i < cpuc->n_events) {
+               cpuc->event_list[i-1] = cpuc->event_list[i];
+               cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
+       }
+       --cpuc->n_events;
+
+       perf_event_update_userpage(event);
+}
+
+int x86_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_events *cpuc;
+       struct perf_event *event;
+       int idx, handled = 0;
+       u64 val;
+
+       cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       /*
+        * Some chipsets need to unmask the LVTPC in a particular spot
+        * inside the nmi handler.  As a result, the unmasking was pushed
+        * into all the nmi handlers.
+        *
+        * This generic handler doesn't seem to have any issues where the
+        * unmasking occurs so it was left at the top.
+        */
+       apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               if (!test_bit(idx, cpuc->active_mask)) {
+                       /*
+                        * Though we deactivated the counter some cpus
+                        * might still deliver spurious interrupts still
+                        * in flight. Catch them:
+                        */
+                       if (__test_and_clear_bit(idx, cpuc->running))
+                               handled++;
+                       continue;
+               }
+
+               event = cpuc->events[idx];
+
+               val = x86_perf_event_update(event);
+               if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
+                       continue;
+
+               /*
+                * event overflow
+                */
+               handled++;
+               perf_sample_data_init(&data, 0, event->hw.last_period);
+
+               if (!x86_perf_event_set_period(event))
+                       continue;
+
+               if (perf_event_overflow(event, &data, regs))
+                       x86_pmu_stop(event, 0);
+       }
+
+       if (handled)
+               inc_irq_stat(apic_perf_irqs);
+
+       return handled;
+}
+
+void perf_events_lapic_init(void)
+{
+       if (!x86_pmu.apic || !x86_pmu_initialized())
+               return;
+
+       /*
+        * Always use NMI for PMU
+        */
+       apic_write(APIC_LVTPC, APIC_DM_NMI);
+}
+
+static int
+perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+       u64 start_clock;
+       u64 finish_clock;
+       int ret;
+
+       /*
+        * All PMUs/events that share this PMI handler should make sure to
+        * increment active_events for their events.
+        */
+       if (!atomic_read(&active_events))
+               return NMI_DONE;
+
+       start_clock = sched_clock();
+       ret = x86_pmu.handle_irq(regs);
+       finish_clock = sched_clock();
+
+       perf_sample_event_took(finish_clock - start_clock);
+
+       return ret;
+}
+NOKPROBE_SYMBOL(perf_event_nmi_handler);
+
+struct event_constraint emptyconstraint;
+struct event_constraint unconstrained;
+
+static int
+x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+       int i, ret = NOTIFY_OK;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
+                       cpuc->kfree_on_online[i] = NULL;
+               if (x86_pmu.cpu_prepare)
+                       ret = x86_pmu.cpu_prepare(cpu);
+               break;
+
+       case CPU_STARTING:
+               if (x86_pmu.cpu_starting)
+                       x86_pmu.cpu_starting(cpu);
+               break;
+
+       case CPU_ONLINE:
+               for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
+                       kfree(cpuc->kfree_on_online[i]);
+                       cpuc->kfree_on_online[i] = NULL;
+               }
+               break;
+
+       case CPU_DYING:
+               if (x86_pmu.cpu_dying)
+                       x86_pmu.cpu_dying(cpu);
+               break;
+
+       case CPU_UP_CANCELED:
+       case CPU_DEAD:
+               if (x86_pmu.cpu_dead)
+                       x86_pmu.cpu_dead(cpu);
+               break;
+
+       default:
+               break;
+       }
+
+       return ret;
+}
+
+static void __init pmu_check_apic(void)
+{
+       if (cpu_has_apic)
+               return;
+
+       x86_pmu.apic = 0;
+       pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
+       pr_info("no hardware sampling interrupt available.\n");
+
+       /*
+        * If we have a PMU initialized but no APIC
+        * interrupts, we cannot sample hardware
+        * events (user-space has to fall back and
+        * sample via a hrtimer based software event):
+        */
+       pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
+
+}
+
+static struct attribute_group x86_pmu_format_group = {
+       .name = "format",
+       .attrs = NULL,
+};
+
+/*
+ * Remove all undefined events (x86_pmu.event_map(id) == 0)
+ * out of events_attr attributes.
+ */
+static void __init filter_events(struct attribute **attrs)
+{
+       struct device_attribute *d;
+       struct perf_pmu_events_attr *pmu_attr;
+       int offset = 0;
+       int i, j;
+
+       for (i = 0; attrs[i]; i++) {
+               d = (struct device_attribute *)attrs[i];
+               pmu_attr = container_of(d, struct perf_pmu_events_attr, attr);
+               /* str trumps id */
+               if (pmu_attr->event_str)
+                       continue;
+               if (x86_pmu.event_map(i + offset))
+                       continue;
+
+               for (j = i; attrs[j]; j++)
+                       attrs[j] = attrs[j + 1];
+
+               /* Check the shifted attr. */
+               i--;
+
+               /*
+                * event_map() is index based, the attrs array is organized
+                * by increasing event index. If we shift the events, then
+                * we need to compensate for the event_map(), otherwise
+                * we are looking up the wrong event in the map
+                */
+               offset++;
+       }
+}
+
+/* Merge two pointer arrays */
+__init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
+{
+       struct attribute **new;
+       int j, i;
+
+       for (j = 0; a[j]; j++)
+               ;
+       for (i = 0; b[i]; i++)
+               j++;
+       j++;
+
+       new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL);
+       if (!new)
+               return NULL;
+
+       j = 0;
+       for (i = 0; a[i]; i++)
+               new[j++] = a[i];
+       for (i = 0; b[i]; i++)
+               new[j++] = b[i];
+       new[j] = NULL;
+
+       return new;
+}
+
+ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
+                         char *page)
+{
+       struct perf_pmu_events_attr *pmu_attr = \
+               container_of(attr, struct perf_pmu_events_attr, attr);
+       u64 config = x86_pmu.event_map(pmu_attr->id);
+
+       /* string trumps id */
+       if (pmu_attr->event_str)
+               return sprintf(page, "%s", pmu_attr->event_str);
+
+       return x86_pmu.events_sysfs_show(page, config);
+}
+
+EVENT_ATTR(cpu-cycles,                 CPU_CYCLES              );
+EVENT_ATTR(instructions,               INSTRUCTIONS            );
+EVENT_ATTR(cache-references,           CACHE_REFERENCES        );
+EVENT_ATTR(cache-misses,               CACHE_MISSES            );
+EVENT_ATTR(branch-instructions,                BRANCH_INSTRUCTIONS     );
+EVENT_ATTR(branch-misses,              BRANCH_MISSES           );
+EVENT_ATTR(bus-cycles,                 BUS_CYCLES              );
+EVENT_ATTR(stalled-cycles-frontend,    STALLED_CYCLES_FRONTEND );
+EVENT_ATTR(stalled-cycles-backend,     STALLED_CYCLES_BACKEND  );
+EVENT_ATTR(ref-cycles,                 REF_CPU_CYCLES          );
+
+static struct attribute *empty_attrs;
+
+static struct attribute *events_attr[] = {
+       EVENT_PTR(CPU_CYCLES),
+       EVENT_PTR(INSTRUCTIONS),
+       EVENT_PTR(CACHE_REFERENCES),
+       EVENT_PTR(CACHE_MISSES),
+       EVENT_PTR(BRANCH_INSTRUCTIONS),
+       EVENT_PTR(BRANCH_MISSES),
+       EVENT_PTR(BUS_CYCLES),
+       EVENT_PTR(STALLED_CYCLES_FRONTEND),
+       EVENT_PTR(STALLED_CYCLES_BACKEND),
+       EVENT_PTR(REF_CPU_CYCLES),
+       NULL,
+};
+
+static struct attribute_group x86_pmu_events_group = {
+       .name = "events",
+       .attrs = events_attr,
+};
+
+ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
+{
+       u64 umask  = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
+       u64 cmask  = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
+       bool edge  = (config & ARCH_PERFMON_EVENTSEL_EDGE);
+       bool pc    = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
+       bool any   = (config & ARCH_PERFMON_EVENTSEL_ANY);
+       bool inv   = (config & ARCH_PERFMON_EVENTSEL_INV);
+       ssize_t ret;
+
+       /*
+       * We have whole page size to spend and just little data
+       * to write, so we can safely use sprintf.
+       */
+       ret = sprintf(page, "event=0x%02llx", event);
+
+       if (umask)
+               ret += sprintf(page + ret, ",umask=0x%02llx", umask);
+
+       if (edge)
+               ret += sprintf(page + ret, ",edge");
+
+       if (pc)
+               ret += sprintf(page + ret, ",pc");
+
+       if (any)
+               ret += sprintf(page + ret, ",any");
+
+       if (inv)
+               ret += sprintf(page + ret, ",inv");
+
+       if (cmask)
+               ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
+
+       ret += sprintf(page + ret, "\n");
+
+       return ret;
+}
+
+static int __init init_hw_perf_events(void)
+{
+       struct x86_pmu_quirk *quirk;
+       int err;
+
+       pr_info("Performance Events: ");
+
+       switch (boot_cpu_data.x86_vendor) {
+       case X86_VENDOR_INTEL:
+               err = intel_pmu_init();
+               break;
+       case X86_VENDOR_AMD:
+               err = amd_pmu_init();
+               break;
+       default:
+               err = -ENOTSUPP;
+       }
+       if (err != 0) {
+               pr_cont("no PMU driver, software events only.\n");
+               return 0;
+       }
+
+       pmu_check_apic();
+
+       /* sanity check that the hardware exists or is emulated */
+       if (!check_hw_exists())
+               return 0;
+
+       pr_cont("%s PMU driver.\n", x86_pmu.name);
+
+       x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
+
+       for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
+               quirk->func();
+
+       if (!x86_pmu.intel_ctrl)
+               x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+
+       perf_events_lapic_init();
+       register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
+
+       unconstrained = (struct event_constraint)
+               __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
+                                  0, x86_pmu.num_counters, 0, 0);
+
+       x86_pmu_format_group.attrs = x86_pmu.format_attrs;
+
+       if (x86_pmu.event_attrs)
+               x86_pmu_events_group.attrs = x86_pmu.event_attrs;
+
+       if (!x86_pmu.events_sysfs_show)
+               x86_pmu_events_group.attrs = &empty_attrs;
+       else
+               filter_events(x86_pmu_events_group.attrs);
+
+       if (x86_pmu.cpu_events) {
+               struct attribute **tmp;
+
+               tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events);
+               if (!WARN_ON(!tmp))
+                       x86_pmu_events_group.attrs = tmp;
+       }
+
+       pr_info("... version:                %d\n",     x86_pmu.version);
+       pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
+       pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
+       pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
+       pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
+       pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
+       pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
+
+       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
+       perf_cpu_notifier(x86_pmu_notifier);
+
+       return 0;
+}
+early_initcall(init_hw_perf_events);
+
+static inline void x86_pmu_read(struct perf_event *event)
+{
+       x86_perf_event_update(event);
+}
+
+/*
+ * Start group events scheduling transaction
+ * Set the flag to make pmu::enable() not perform the
+ * schedulability test, it will be performed at commit time
+ *
+ * We only support PERF_PMU_TXN_ADD transactions. Save the
+ * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
+ * transactions.
+ */
+static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       WARN_ON_ONCE(cpuc->txn_flags);          /* txn already in flight */
+
+       cpuc->txn_flags = txn_flags;
+       if (txn_flags & ~PERF_PMU_TXN_ADD)
+               return;
+
+       perf_pmu_disable(pmu);
+       __this_cpu_write(cpu_hw_events.n_txn, 0);
+}
+
+/*
+ * Stop group events scheduling transaction
+ * Clear the flag and pmu::enable() will perform the
+ * schedulability test.
+ */
+static void x86_pmu_cancel_txn(struct pmu *pmu)
+{
+       unsigned int txn_flags;
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
+
+       txn_flags = cpuc->txn_flags;
+       cpuc->txn_flags = 0;
+       if (txn_flags & ~PERF_PMU_TXN_ADD)
+               return;
+
+       /*
+        * Truncate collected array by the number of events added in this
+        * transaction. See x86_pmu_add() and x86_pmu_*_txn().
+        */
+       __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
+       __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
+       perf_pmu_enable(pmu);
+}
+
+/*
+ * Commit group events scheduling transaction
+ * Perform the group schedulability test as a whole
+ * Return 0 if success
+ *
+ * Does not cancel the transaction on failure; expects the caller to do this.
+ */
+static int x86_pmu_commit_txn(struct pmu *pmu)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int assign[X86_PMC_IDX_MAX];
+       int n, ret;
+
+       WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
+
+       if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
+               cpuc->txn_flags = 0;
+               return 0;
+       }
+
+       n = cpuc->n_events;
+
+       if (!x86_pmu_initialized())
+               return -EAGAIN;
+
+       ret = x86_pmu.schedule_events(cpuc, n, assign);
+       if (ret)
+               return ret;
+
+       /*
+        * copy new assignment, now we know it is possible
+        * will be used by hw_perf_enable()
+        */
+       memcpy(cpuc->assign, assign, n*sizeof(int));
+
+       cpuc->txn_flags = 0;
+       perf_pmu_enable(pmu);
+       return 0;
+}
+/*
+ * a fake_cpuc is used to validate event groups. Due to
+ * the extra reg logic, we need to also allocate a fake
+ * per_core and per_cpu structure. Otherwise, group events
+ * using extra reg may conflict without the kernel being
+ * able to catch this when the last event gets added to
+ * the group.
+ */
+static void free_fake_cpuc(struct cpu_hw_events *cpuc)
+{
+       kfree(cpuc->shared_regs);
+       kfree(cpuc);
+}
+
+static struct cpu_hw_events *allocate_fake_cpuc(void)
+{
+       struct cpu_hw_events *cpuc;
+       int cpu = raw_smp_processor_id();
+
+       cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
+       if (!cpuc)
+               return ERR_PTR(-ENOMEM);
+
+       /* only needed, if we have extra_regs */
+       if (x86_pmu.extra_regs) {
+               cpuc->shared_regs = allocate_shared_regs(cpu);
+               if (!cpuc->shared_regs)
+                       goto error;
+       }
+       cpuc->is_fake = 1;
+       return cpuc;
+error:
+       free_fake_cpuc(cpuc);
+       return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * validate that we can schedule this event
+ */
+static int validate_event(struct perf_event *event)
+{
+       struct cpu_hw_events *fake_cpuc;
+       struct event_constraint *c;
+       int ret = 0;
+
+       fake_cpuc = allocate_fake_cpuc();
+       if (IS_ERR(fake_cpuc))
+               return PTR_ERR(fake_cpuc);
+
+       c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
+
+       if (!c || !c->weight)
+               ret = -EINVAL;
+
+       if (x86_pmu.put_event_constraints)
+               x86_pmu.put_event_constraints(fake_cpuc, event);
+
+       free_fake_cpuc(fake_cpuc);
+
+       return ret;
+}
+
+/*
+ * validate a single event group
+ *
+ * validation include:
+ *     - check events are compatible which each other
+ *     - events do not compete for the same counter
+ *     - number of events <= number of counters
+ *
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
+ */
+static int validate_group(struct perf_event *event)
+{
+       struct perf_event *leader = event->group_leader;
+       struct cpu_hw_events *fake_cpuc;
+       int ret = -EINVAL, n;
+
+       fake_cpuc = allocate_fake_cpuc();
+       if (IS_ERR(fake_cpuc))
+               return PTR_ERR(fake_cpuc);
+       /*
+        * the event is not yet connected with its
+        * siblings therefore we must first collect
+        * existing siblings, then add the new event
+        * before we can simulate the scheduling
+        */
+       n = collect_events(fake_cpuc, leader, true);
+       if (n < 0)
+               goto out;
+
+       fake_cpuc->n_events = n;
+       n = collect_events(fake_cpuc, event, false);
+       if (n < 0)
+               goto out;
+
+       fake_cpuc->n_events = n;
+
+       ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
+
+out:
+       free_fake_cpuc(fake_cpuc);
+       return ret;
+}
+
+static int x86_pmu_event_init(struct perf_event *event)
+{
+       struct pmu *tmp;
+       int err;
+
+       switch (event->attr.type) {
+       case PERF_TYPE_RAW:
+       case PERF_TYPE_HARDWARE:
+       case PERF_TYPE_HW_CACHE:
+               break;
+
+       default:
+               return -ENOENT;
+       }
+
+       err = __x86_pmu_event_init(event);
+       if (!err) {
+               /*
+                * we temporarily connect event to its pmu
+                * such that validate_group() can classify
+                * it as an x86 event using is_x86_event()
+                */
+               tmp = event->pmu;
+               event->pmu = &pmu;
+
+               if (event->group_leader != event)
+                       err = validate_group(event);
+               else
+                       err = validate_event(event);
+
+               event->pmu = tmp;
+       }
+       if (err) {
+               if (event->destroy)
+                       event->destroy(event);
+       }
+
+       if (ACCESS_ONCE(x86_pmu.attr_rdpmc))
+               event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
+
+       return err;
+}
+
+static void refresh_pce(void *ignored)
+{
+       if (current->mm)
+               load_mm_cr4(current->mm);
+}
+
+static void x86_pmu_event_mapped(struct perf_event *event)
+{
+       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+               return;
+
+       if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
+               on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
+static void x86_pmu_event_unmapped(struct perf_event *event)
+{
+       if (!current->mm)
+               return;
+
+       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+               return;
+
+       if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
+               on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
+static int x86_pmu_event_idx(struct perf_event *event)
+{
+       int idx = event->hw.idx;
+
+       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+               return 0;
+
+       if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
+               idx -= INTEL_PMC_IDX_FIXED;
+               idx |= 1 << 30;
+       }
+
+       return idx + 1;
+}
+
+static ssize_t get_attr_rdpmc(struct device *cdev,
+                             struct device_attribute *attr,
+                             char *buf)
+{
+       return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
+}
+
+static ssize_t set_attr_rdpmc(struct device *cdev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t count)
+{
+       unsigned long val;
+       ssize_t ret;
+
+       ret = kstrtoul(buf, 0, &val);
+       if (ret)
+               return ret;
+
+       if (val > 2)
+               return -EINVAL;
+
+       if (x86_pmu.attr_rdpmc_broken)
+               return -ENOTSUPP;
+
+       if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
+               /*
+                * Changing into or out of always available, aka
+                * perf-event-bypassing mode.  This path is extremely slow,
+                * but only root can trigger it, so it's okay.
+                */
+               if (val == 2)
+                       static_key_slow_inc(&rdpmc_always_available);
+               else
+                       static_key_slow_dec(&rdpmc_always_available);
+               on_each_cpu(refresh_pce, NULL, 1);
+       }
+
+       x86_pmu.attr_rdpmc = val;
+
+       return count;
+}
+
+static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
+
+static struct attribute *x86_pmu_attrs[] = {
+       &dev_attr_rdpmc.attr,
+       NULL,
+};
+
+static struct attribute_group x86_pmu_attr_group = {
+       .attrs = x86_pmu_attrs,
+};
+
+static const struct attribute_group *x86_pmu_attr_groups[] = {
+       &x86_pmu_attr_group,
+       &x86_pmu_format_group,
+       &x86_pmu_events_group,
+       NULL,
+};
+
+static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+       if (x86_pmu.sched_task)
+               x86_pmu.sched_task(ctx, sched_in);
+}
+
+void perf_check_microcode(void)
+{
+       if (x86_pmu.check_microcode)
+               x86_pmu.check_microcode();
+}
+EXPORT_SYMBOL_GPL(perf_check_microcode);
+
+static struct pmu pmu = {
+       .pmu_enable             = x86_pmu_enable,
+       .pmu_disable            = x86_pmu_disable,
+
+       .attr_groups            = x86_pmu_attr_groups,
+
+       .event_init             = x86_pmu_event_init,
+
+       .event_mapped           = x86_pmu_event_mapped,
+       .event_unmapped         = x86_pmu_event_unmapped,
+
+       .add                    = x86_pmu_add,
+       .del                    = x86_pmu_del,
+       .start                  = x86_pmu_start,
+       .stop                   = x86_pmu_stop,
+       .read                   = x86_pmu_read,
+
+       .start_txn              = x86_pmu_start_txn,
+       .cancel_txn             = x86_pmu_cancel_txn,
+       .commit_txn             = x86_pmu_commit_txn,
+
+       .event_idx              = x86_pmu_event_idx,
+       .sched_task             = x86_pmu_sched_task,
+       .task_ctx_size          = sizeof(struct x86_perf_task_context),
+};
+
+void arch_perf_update_userpage(struct perf_event *event,
+                              struct perf_event_mmap_page *userpg, u64 now)
+{
+       struct cyc2ns_data *data;
+
+       userpg->cap_user_time = 0;
+       userpg->cap_user_time_zero = 0;
+       userpg->cap_user_rdpmc =
+               !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
+       userpg->pmc_width = x86_pmu.cntval_bits;
+
+       if (!sched_clock_stable())
+               return;
+
+       data = cyc2ns_read_begin();
+
+       /*
+        * Internal timekeeping for enabled/running/stopped times
+        * is always in the local_clock domain.
+        */
+       userpg->cap_user_time = 1;
+       userpg->time_mult = data->cyc2ns_mul;
+       userpg->time_shift = data->cyc2ns_shift;
+       userpg->time_offset = data->cyc2ns_offset - now;
+
+       /*
+        * cap_user_time_zero doesn't make sense when we're using a different
+        * time base for the records.
+        */
+       if (event->clock == &local_clock) {
+               userpg->cap_user_time_zero = 1;
+               userpg->time_zero = data->cyc2ns_offset;
+       }
+
+       cyc2ns_read_end(data);
+}
+
+/*
+ * callchain support
+ */
+
+static int backtrace_stack(void *data, char *name)
+{
+       return 0;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+       struct perf_callchain_entry *entry = data;
+
+       perf_callchain_store(entry, addr);
+}
+
+static const struct stacktrace_ops backtrace_ops = {
+       .stack                  = backtrace_stack,
+       .address                = backtrace_address,
+       .walk_stack             = print_context_stack_bp,
+};
+
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+{
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               /* TODO: We don't support guest os callchain now */
+               return;
+       }
+
+       perf_callchain_store(entry, regs->ip);
+
+       dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
+}
+
+static inline int
+valid_user_frame(const void __user *fp, unsigned long size)
+{
+       return (__range_not_ok(fp, size, TASK_SIZE) == 0);
+}
+
+static unsigned long get_segment_base(unsigned int segment)
+{
+       struct desc_struct *desc;
+       int idx = segment >> 3;
+
+       if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+               struct ldt_struct *ldt;
+
+               if (idx > LDT_ENTRIES)
+                       return 0;
+
+               /* IRQs are off, so this synchronizes with smp_store_release */
+               ldt = lockless_dereference(current->active_mm->context.ldt);
+               if (!ldt || idx > ldt->size)
+                       return 0;
+
+               desc = &ldt->entries[idx];
+#else
+               return 0;
+#endif
+       } else {
+               if (idx > GDT_ENTRIES)
+                       return 0;
+
+               desc = raw_cpu_ptr(gdt_page.gdt) + idx;
+       }
+
+       return get_desc_base(desc);
+}
+
+#ifdef CONFIG_IA32_EMULATION
+
+#include <asm/compat.h>
+
+static inline int
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+       /* 32-bit process in 64-bit kernel. */
+       unsigned long ss_base, cs_base;
+       struct stack_frame_ia32 frame;
+       const void __user *fp;
+
+       if (!test_thread_flag(TIF_IA32))
+               return 0;
+
+       cs_base = get_segment_base(regs->cs);
+       ss_base = get_segment_base(regs->ss);
+
+       fp = compat_ptr(ss_base + regs->bp);
+       pagefault_disable();
+       while (entry->nr < PERF_MAX_STACK_DEPTH) {
+               unsigned long bytes;
+               frame.next_frame     = 0;
+               frame.return_address = 0;
+
+               if (!access_ok(VERIFY_READ, fp, 8))
+                       break;
+
+               bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4);
+               if (bytes != 0)
+                       break;
+               bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4);
+               if (bytes != 0)
+                       break;
+
+               if (!valid_user_frame(fp, sizeof(frame)))
+                       break;
+
+               perf_callchain_store(entry, cs_base + frame.return_address);
+               fp = compat_ptr(ss_base + frame.next_frame);
+       }
+       pagefault_enable();
+       return 1;
+}
+#else
+static inline int
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+    return 0;
+}
+#endif
+
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+{
+       struct stack_frame frame;
+       const void __user *fp;
+
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               /* TODO: We don't support guest os callchain now */
+               return;
+       }
+
+       /*
+        * We don't know what to do with VM86 stacks.. ignore them for now.
+        */
+       if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
+               return;
+
+       fp = (void __user *)regs->bp;
+
+       perf_callchain_store(entry, regs->ip);
+
+       if (!current->mm)
+               return;
+
+       if (perf_callchain_user32(regs, entry))
+               return;
+
+       pagefault_disable();
+       while (entry->nr < PERF_MAX_STACK_DEPTH) {
+               unsigned long bytes;
+               frame.next_frame             = NULL;
+               frame.return_address = 0;
+
+               if (!access_ok(VERIFY_READ, fp, 16))
+                       break;
+
+               bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8);
+               if (bytes != 0)
+                       break;
+               bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8);
+               if (bytes != 0)
+                       break;
+
+               if (!valid_user_frame(fp, sizeof(frame)))
+                       break;
+
+               perf_callchain_store(entry, frame.return_address);
+               fp = (void __user *)frame.next_frame;
+       }
+       pagefault_enable();
+}
+
+/*
+ * Deal with code segment offsets for the various execution modes:
+ *
+ *   VM86 - the good olde 16 bit days, where the linear address is
+ *          20 bits and we use regs->ip + 0x10 * regs->cs.
+ *
+ *   IA32 - Where we need to look at GDT/LDT segment descriptor tables
+ *          to figure out what the 32bit base address is.
+ *
+ *    X32 - has TIF_X32 set, but is running in x86_64
+ *
+ * X86_64 - CS,DS,SS,ES are all zero based.
+ */
+static unsigned long code_segment_base(struct pt_regs *regs)
+{
+       /*
+        * For IA32 we look at the GDT/LDT segment base to convert the
+        * effective IP to a linear address.
+        */
+
+#ifdef CONFIG_X86_32
+       /*
+        * If we are in VM86 mode, add the segment offset to convert to a
+        * linear address.
+        */
+       if (regs->flags & X86_VM_MASK)
+               return 0x10 * regs->cs;
+
+       if (user_mode(regs) && regs->cs != __USER_CS)
+               return get_segment_base(regs->cs);
+#else
+       if (user_mode(regs) && !user_64bit_mode(regs) &&
+           regs->cs != __USER32_CS)
+               return get_segment_base(regs->cs);
+#endif
+       return 0;
+}
+
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
+               return perf_guest_cbs->get_guest_ip();
+
+       return regs->ip + code_segment_base(regs);
+}
+
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+       int misc = 0;
+
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               if (perf_guest_cbs->is_user_mode())
+                       misc |= PERF_RECORD_MISC_GUEST_USER;
+               else
+                       misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+       } else {
+               if (user_mode(regs))
+                       misc |= PERF_RECORD_MISC_USER;
+               else
+                       misc |= PERF_RECORD_MISC_KERNEL;
+       }
+
+       if (regs->flags & PERF_EFLAGS_EXACT)
+               misc |= PERF_RECORD_MISC_EXACT_IP;
+
+       return misc;
+}
+
+void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
+{
+       cap->version            = x86_pmu.version;
+       cap->num_counters_gp    = x86_pmu.num_counters;
+       cap->num_counters_fixed = x86_pmu.num_counters_fixed;
+       cap->bit_width_gp       = x86_pmu.cntval_bits;
+       cap->bit_width_fixed    = x86_pmu.cntval_bits;
+       cap->events_mask        = (unsigned int)x86_pmu.events_maskl;
+       cap->events_mask_len    = x86_pmu.events_mask_len;
+}
+EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c

new file mode 100644 (file)

index 0000000..b99dc92
--- /dev/null
+++ b/arch/x86/events/intel/bts.c
@@ -0,0 +1,544 @@
+/*
+ * BTS PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/coredump.h>
+
+#include <asm-generic/sizes.h>
+#include <asm/perf_event.h>
+
+#include "../perf_event.h"
+
+struct bts_ctx {
+       struct perf_output_handle       handle;
+       struct debug_store              ds_back;
+       int                             started;
+};
+
+static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
+
+#define BTS_RECORD_SIZE                24
+#define BTS_SAFETY_MARGIN      4080
+
+struct bts_phys {
+       struct page     *page;
+       unsigned long   size;
+       unsigned long   offset;
+       unsigned long   displacement;
+};
+
+struct bts_buffer {
+       size_t          real_size;      /* multiple of BTS_RECORD_SIZE */
+       unsigned int    nr_pages;
+       unsigned int    nr_bufs;
+       unsigned int    cur_buf;
+       bool            snapshot;
+       local_t         data_size;
+       local_t         lost;
+       local_t         head;
+       unsigned long   end;
+       void            **data_pages;
+       struct bts_phys buf[0];
+};
+
+struct pmu bts_pmu;
+
+static size_t buf_size(struct page *page)
+{
+       return 1 << (PAGE_SHIFT + page_private(page));
+}
+
+static void *
+bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
+{
+       struct bts_buffer *buf;
+       struct page *page;
+       int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+       unsigned long offset;
+       size_t size = nr_pages << PAGE_SHIFT;
+       int pg, nbuf, pad;
+
+       /* count all the high order buffers */
+       for (pg = 0, nbuf = 0; pg < nr_pages;) {
+               page = virt_to_page(pages[pg]);
+               if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1))
+                       return NULL;
+               pg += 1 << page_private(page);
+               nbuf++;
+       }
+
+       /*
+        * to avoid interrupts in overwrite mode, only allow one physical
+        */
+       if (overwrite && nbuf > 1)
+               return NULL;
+
+       buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
+       if (!buf)
+               return NULL;
+
+       buf->nr_pages = nr_pages;
+       buf->nr_bufs = nbuf;
+       buf->snapshot = overwrite;
+       buf->data_pages = pages;
+       buf->real_size = size - size % BTS_RECORD_SIZE;
+
+       for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
+               unsigned int __nr_pages;
+
+               page = virt_to_page(pages[pg]);
+               __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
+               buf->buf[nbuf].page = page;
+               buf->buf[nbuf].offset = offset;
+               buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
+               buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
+               pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
+               buf->buf[nbuf].size -= pad;
+
+               pg += __nr_pages;
+               offset += __nr_pages << PAGE_SHIFT;
+       }
+
+       return buf;
+}
+
+static void bts_buffer_free_aux(void *data)
+{
+       kfree(data);
+}
+
+static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
+{
+       return buf->buf[idx].offset + buf->buf[idx].displacement;
+}
+
+static void
+bts_config_buffer(struct bts_buffer *buf)
+{
+       int cpu = raw_smp_processor_id();
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+       struct bts_phys *phys = &buf->buf[buf->cur_buf];
+       unsigned long index, thresh = 0, end = phys->size;
+       struct page *page = phys->page;
+
+       index = local_read(&buf->head);
+
+       if (!buf->snapshot) {
+               if (buf->end < phys->offset + buf_size(page))
+                       end = buf->end - phys->offset - phys->displacement;
+
+               index -= phys->offset + phys->displacement;
+
+               if (end - index > BTS_SAFETY_MARGIN)
+                       thresh = end - BTS_SAFETY_MARGIN;
+               else if (end - index > BTS_RECORD_SIZE)
+                       thresh = end - BTS_RECORD_SIZE;
+               else
+                       thresh = end;
+       }
+
+       ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
+       ds->bts_index = ds->bts_buffer_base + index;
+       ds->bts_absolute_maximum = ds->bts_buffer_base + end;
+       ds->bts_interrupt_threshold = !buf->snapshot
+               ? ds->bts_buffer_base + thresh
+               : ds->bts_absolute_maximum + BTS_RECORD_SIZE;
+}
+
+static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
+{
+       unsigned long index = head - phys->offset;
+
+       memset(page_address(phys->page) + index, 0, phys->size - index);
+}
+
+static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts)
+{
+       if (buf->snapshot)
+               return false;
+
+       if (local_read(&buf->data_size) >= bts->handle.size ||
+           bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE)
+               return true;
+
+       return false;
+}
+
+static void bts_update(struct bts_ctx *bts)
+{
+       int cpu = raw_smp_processor_id();
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+       struct bts_buffer *buf = perf_get_aux(&bts->handle);
+       unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;
+
+       if (!buf)
+               return;
+
+       head = index + bts_buffer_offset(buf, buf->cur_buf);
+       old = local_xchg(&buf->head, head);
+
+       if (!buf->snapshot) {
+               if (old == head)
+                       return;
+
+               if (ds->bts_index >= ds->bts_absolute_maximum)
+                       local_inc(&buf->lost);
+
+               /*
+                * old and head are always in the same physical buffer, so we
+                * can subtract them to get the data size.
+                */
+               local_add(head - old, &buf->data_size);
+       } else {
+               local_set(&buf->data_size, head);
+       }
+}
+
+static void __bts_event_start(struct perf_event *event)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+       struct bts_buffer *buf = perf_get_aux(&bts->handle);
+       u64 config = 0;
+
+       if (!buf || bts_buffer_is_full(buf, bts))
+               return;
+
+       event->hw.itrace_started = 1;
+       event->hw.state = 0;
+
+       if (!buf->snapshot)
+               config |= ARCH_PERFMON_EVENTSEL_INT;
+       if (!event->attr.exclude_kernel)
+               config |= ARCH_PERFMON_EVENTSEL_OS;
+       if (!event->attr.exclude_user)
+               config |= ARCH_PERFMON_EVENTSEL_USR;
+
+       bts_config_buffer(buf);
+
+       /*
+        * local barrier to make sure that ds configuration made it
+        * before we enable BTS
+        */
+       wmb();
+
+       intel_pmu_enable_bts(config);
+}
+
+static void bts_event_start(struct perf_event *event, int flags)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+       __bts_event_start(event);
+
+       /* PMI handler: this counter is running and likely generating PMIs */
+       ACCESS_ONCE(bts->started) = 1;
+}
+
+static void __bts_event_stop(struct perf_event *event)
+{
+       /*
+        * No extra synchronization is mandated by the documentation to have
+        * BTS data stores globally visible.
+        */
+       intel_pmu_disable_bts();
+
+       if (event->hw.state & PERF_HES_STOPPED)
+               return;
+
+       ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
+}
+
+static void bts_event_stop(struct perf_event *event, int flags)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+       /* PMI handler: don't restart this counter */
+       ACCESS_ONCE(bts->started) = 0;
+
+       __bts_event_stop(event);
+
+       if (flags & PERF_EF_UPDATE)
+               bts_update(bts);
+}
+
+void intel_bts_enable_local(void)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+       if (bts->handle.event && bts->started)
+               __bts_event_start(bts->handle.event);
+}
+
+void intel_bts_disable_local(void)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+       if (bts->handle.event)
+               __bts_event_stop(bts->handle.event);
+}
+
+static int
+bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
+{
+       unsigned long head, space, next_space, pad, gap, skip, wakeup;
+       unsigned int next_buf;
+       struct bts_phys *phys, *next_phys;
+       int ret;
+
+       if (buf->snapshot)
+               return 0;
+
+       head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
+       if (WARN_ON_ONCE(head != local_read(&buf->head)))
+               return -EINVAL;
+
+       phys = &buf->buf[buf->cur_buf];
+       space = phys->offset + phys->displacement + phys->size - head;
+       pad = space;
+       if (space > handle->size) {
+               space = handle->size;
+               space -= space % BTS_RECORD_SIZE;
+       }
+       if (space <= BTS_SAFETY_MARGIN) {
+               /* See if next phys buffer has more space */
+               next_buf = buf->cur_buf + 1;
+               if (next_buf >= buf->nr_bufs)
+                       next_buf = 0;
+               next_phys = &buf->buf[next_buf];
+               gap = buf_size(phys->page) - phys->displacement - phys->size +
+                     next_phys->displacement;
+               skip = pad + gap;
+               if (handle->size >= skip) {
+                       next_space = next_phys->size;
+                       if (next_space + skip > handle->size) {
+                               next_space = handle->size - skip;
+                               next_space -= next_space % BTS_RECORD_SIZE;
+                       }
+                       if (next_space > space || !space) {
+                               if (pad)
+                                       bts_buffer_pad_out(phys, head);
+                               ret = perf_aux_output_skip(handle, skip);
+                               if (ret)
+                                       return ret;
+                               /* Advance to next phys buffer */
+                               phys = next_phys;
+                               space = next_space;
+                               head = phys->offset + phys->displacement;
+                               /*
+                                * After this, cur_buf and head won't match ds
+                                * anymore, so we must not be racing with
+                                * bts_update().
+                                */
+                               buf->cur_buf = next_buf;
+                               local_set(&buf->head, head);
+                       }
+               }
+       }
+
+       /* Don't go far beyond wakeup watermark */
+       wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
+                handle->head;
+       if (space > wakeup) {
+               space = wakeup;
+               space -= space % BTS_RECORD_SIZE;
+       }
+
+       buf->end = head + space;
+
+       /*
+        * If we have no space, the lost notification would have been sent when
+        * we hit absolute_maximum - see bts_update()
+        */
+       if (!space)
+               return -ENOSPC;
+
+       return 0;
+}
+
+int intel_bts_interrupt(void)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+       struct perf_event *event = bts->handle.event;
+       struct bts_buffer *buf;
+       s64 old_head;
+       int err;
+
+       if (!event || !bts->started)
+               return 0;
+
+       buf = perf_get_aux(&bts->handle);
+       /*
+        * Skip snapshot counters: they don't use the interrupt, but
+        * there's no other way of telling, because the pointer will
+        * keep moving
+        */
+       if (!buf || buf->snapshot)
+               return 0;
+
+       old_head = local_read(&buf->head);
+       bts_update(bts);
+
+       /* no new data */
+       if (old_head == local_read(&buf->head))
+               return 0;
+
+       perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+                           !!local_xchg(&buf->lost, 0));
+
+       buf = perf_aux_output_begin(&bts->handle, event);
+       if (!buf)
+               return 1;
+
+       err = bts_buffer_reset(buf, &bts->handle);
+       if (err)
+               perf_aux_output_end(&bts->handle, 0, false);
+
+       return 1;
+}
+
+static void bts_event_del(struct perf_event *event, int mode)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+       struct bts_buffer *buf = perf_get_aux(&bts->handle);
+
+       bts_event_stop(event, PERF_EF_UPDATE);
+
+       if (buf) {
+               if (buf->snapshot)
+                       bts->handle.head =
+                               local_xchg(&buf->data_size,
+                                          buf->nr_pages << PAGE_SHIFT);
+               perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+                                   !!local_xchg(&buf->lost, 0));
+       }
+
+       cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
+       cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
+       cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
+       cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
+}
+
+static int bts_event_add(struct perf_event *event, int mode)
+{
+       struct bts_buffer *buf;
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       int ret = -EBUSY;
+
+       event->hw.state = PERF_HES_STOPPED;
+
+       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+               return -EBUSY;
+
+       if (bts->handle.event)
+               return -EBUSY;
+
+       buf = perf_aux_output_begin(&bts->handle, event);
+       if (!buf)
+               return -EINVAL;
+
+       ret = bts_buffer_reset(buf, &bts->handle);
+       if (ret) {
+               perf_aux_output_end(&bts->handle, 0, false);
+               return ret;
+       }
+
+       bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
+       bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
+       bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
+
+       if (mode & PERF_EF_START) {
+               bts_event_start(event, 0);
+               if (hwc->state & PERF_HES_STOPPED) {
+                       bts_event_del(event, 0);
+                       return -EBUSY;
+               }
+       }
+
+       return 0;
+}
+
+static void bts_event_destroy(struct perf_event *event)
+{
+       x86_release_hardware();
+       x86_del_exclusive(x86_lbr_exclusive_bts);
+}
+
+static int bts_event_init(struct perf_event *event)
+{
+       int ret;
+
+       if (event->attr.type != bts_pmu.type)
+               return -ENOENT;
+
+       if (x86_add_exclusive(x86_lbr_exclusive_bts))
+               return -EBUSY;
+
+       /*
+        * BTS leaks kernel addresses even when CPL0 tracing is
+        * disabled, so disallow intel_bts driver for unprivileged
+        * users on paranoid systems since it provides trace data
+        * to the user in a zero-copy fashion.
+        *
+        * Note that the default paranoia setting permits unprivileged
+        * users to profile the kernel.
+        */
+       if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
+           !capable(CAP_SYS_ADMIN))
+               return -EACCES;
+
+       ret = x86_reserve_hardware();
+       if (ret) {
+               x86_del_exclusive(x86_lbr_exclusive_bts);
+               return ret;
+       }
+
+       event->destroy = bts_event_destroy;
+
+       return 0;
+}
+
+static void bts_event_read(struct perf_event *event)
+{
+}
+
+static __init int bts_init(void)
+{
+       if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
+               return -ENODEV;
+
+       bts_pmu.capabilities    = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
+       bts_pmu.task_ctx_nr     = perf_sw_context;
+       bts_pmu.event_init      = bts_event_init;
+       bts_pmu.add             = bts_event_add;
+       bts_pmu.del             = bts_event_del;
+       bts_pmu.start           = bts_event_start;
+       bts_pmu.stop            = bts_event_stop;
+       bts_pmu.read            = bts_event_read;
+       bts_pmu.setup_aux       = bts_buffer_setup_aux;
+       bts_pmu.free_aux        = bts_buffer_free_aux;
+
+       return perf_pmu_register(&bts_pmu, "intel_bts", -1);
+}
+arch_initcall(bts_init);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c

new file mode 100644 (file)

index 0000000..68fa55b
--- /dev/null
+++ b/arch/x86/events/intel/core.c
@@ -0,0 +1,3796 @@
+/*
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nmi.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+/*
+ * Intel PerfMon, used on Core and later.
+ */
+static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
+{
+       [PERF_COUNT_HW_CPU_CYCLES]              = 0x003c,
+       [PERF_COUNT_HW_INSTRUCTIONS]            = 0x00c0,
+       [PERF_COUNT_HW_CACHE_REFERENCES]        = 0x4f2e,
+       [PERF_COUNT_HW_CACHE_MISSES]            = 0x412e,
+       [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]     = 0x00c4,
+       [PERF_COUNT_HW_BRANCH_MISSES]           = 0x00c5,
+       [PERF_COUNT_HW_BUS_CYCLES]              = 0x013c,
+       [PERF_COUNT_HW_REF_CPU_CYCLES]          = 0x0300, /* pseudo-encoding */
+};
+
+static struct event_constraint intel_core_event_constraints[] __read_mostly =
+{
+       INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+       INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+       INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+       INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+       INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
+       INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_core2_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
+       INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+       INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+       INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+       INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+       INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
+       INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
+       INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
+       INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
+       INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
+       INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
+       INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
+       INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
+       INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
+       INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
+       INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+       INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+       EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
+{
+       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
+       EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+       INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
+       INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+       INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_snb_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
+       INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+       INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+       INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
+       INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+
+       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */
+       INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
+       INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
+       INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+
+       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+       EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
+{
+       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
+       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
+       EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_v1_event_constraints[] __read_mostly =
+{
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_gen_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_slm_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_skl_event_constraints[] = {
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0),      /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1),      /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2),      /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2),    /* INST_RETIRED.PREC_DIST */
+       EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
+       INTEL_UEVENT_EXTRA_REG(0x01b7,
+                              MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x02b7,
+                              MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1),
+       EVENT_EXTRA_END
+};
+
+static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
+       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1),
+       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+       EVENT_EXTRA_END
+};
+
+static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
+       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+       EVENT_EXTRA_END
+};
+
+static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+       /*
+        * Note the low 8 bits eventsel code is not a continuous field, containing
+        * some #GPing bits. These are masked out.
+        */
+       INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+       EVENT_EXTRA_END
+};
+
+EVENT_ATTR_STR(mem-loads,      mem_ld_nhm,     "event=0x0b,umask=0x10,ldlat=3");
+EVENT_ATTR_STR(mem-loads,      mem_ld_snb,     "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores,     mem_st_snb,     "event=0xcd,umask=0x2");
+
+struct attribute *nhm_events_attrs[] = {
+       EVENT_PTR(mem_ld_nhm),
+       NULL,
+};
+
+struct attribute *snb_events_attrs[] = {
+       EVENT_PTR(mem_ld_snb),
+       EVENT_PTR(mem_st_snb),
+       NULL,
+};
+
+static struct event_constraint intel_hsw_event_constraints[] = {
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x148, 0x4),    /* L1D_PEND_MISS.PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+       INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+       /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),
+       /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
+       /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
+       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
+
+       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_bdw_event_constraints[] = {
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0),      /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1),      /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2),      /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x148, 0x4),    /* L1D_PEND_MISS.PENDING */
+       INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4),        /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
+       EVENT_CONSTRAINT_END
+};
+
+static u64 intel_pmu_event_map(int hw_event)
+{
+       return intel_perfmon_event_map[hw_event];
+}
+
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * - icache miss does not include decoded icache
+ */
+
+#define SKL_DEMAND_DATA_RD             BIT_ULL(0)
+#define SKL_DEMAND_RFO                 BIT_ULL(1)
+#define SKL_ANY_RESPONSE               BIT_ULL(16)
+#define SKL_SUPPLIER_NONE              BIT_ULL(17)
+#define SKL_L3_MISS_LOCAL_DRAM         BIT_ULL(26)
+#define SKL_L3_MISS_REMOTE_HOP0_DRAM   BIT_ULL(27)
+#define SKL_L3_MISS_REMOTE_HOP1_DRAM   BIT_ULL(28)
+#define SKL_L3_MISS_REMOTE_HOP2P_DRAM  BIT_ULL(29)
+#define SKL_L3_MISS                    (SKL_L3_MISS_LOCAL_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+#define SKL_SPL_HIT                    BIT_ULL(30)
+#define SKL_SNOOP_NONE                 BIT_ULL(31)
+#define SKL_SNOOP_NOT_NEEDED           BIT_ULL(32)
+#define SKL_SNOOP_MISS                 BIT_ULL(33)
+#define SKL_SNOOP_HIT_NO_FWD           BIT_ULL(34)
+#define SKL_SNOOP_HIT_WITH_FWD         BIT_ULL(35)
+#define SKL_SNOOP_HITM                 BIT_ULL(36)
+#define SKL_SNOOP_NON_DRAM             BIT_ULL(37)
+#define SKL_ANY_SNOOP                  (SKL_SPL_HIT|SKL_SNOOP_NONE| \
+                                        SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+                                        SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+                                        SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
+#define SKL_DEMAND_READ                        SKL_DEMAND_DATA_RD
+#define SKL_SNOOP_DRAM                 (SKL_SNOOP_NONE| \
+                                        SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+                                        SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+                                        SKL_SNOOP_HITM|SKL_SPL_HIT)
+#define SKL_DEMAND_WRITE               SKL_DEMAND_RFO
+#define SKL_LLC_ACCESS                 SKL_ANY_RESPONSE
+#define SKL_L3_MISS_REMOTE             (SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+
+static __initconst const u64 skl_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_INST_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x151,   /* L1D.REPLACEMENT */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_INST_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x283,   /* ICACHE_64B.MISS */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_INST_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x608,   /* DTLB_LOAD_MISSES.WALK_COMPLETED */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_INST_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x649,   /* DTLB_STORE_MISSES.WALK_COMPLETED */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2085,  /* ITLB_MISSES.STLB_HIT */
+               [ C(RESULT_MISS)   ] = 0xe85,   /* ITLB_MISSES.WALK_COMPLETED */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0xc4,    /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0xc5,    /* BR_MISP_RETIRED.ALL_BRANCHES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+};
+
+static __initconst const u64 skl_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+                                      SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+               [ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
+                                      SKL_L3_MISS|SKL_ANY_SNOOP|
+                                      SKL_SUPPLIER_NONE,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+                                      SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+               [ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
+                                      SKL_L3_MISS|SKL_ANY_SNOOP|
+                                      SKL_SUPPLIER_NONE,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+                                      SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+               [ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
+                                      SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+                                      SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+               [ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
+                                      SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+};
+
+#define SNB_DMND_DATA_RD       (1ULL << 0)
+#define SNB_DMND_RFO           (1ULL << 1)
+#define SNB_DMND_IFETCH                (1ULL << 2)
+#define SNB_DMND_WB            (1ULL << 3)
+#define SNB_PF_DATA_RD         (1ULL << 4)
+#define SNB_PF_RFO             (1ULL << 5)
+#define SNB_PF_IFETCH          (1ULL << 6)
+#define SNB_LLC_DATA_RD                (1ULL << 7)
+#define SNB_LLC_RFO            (1ULL << 8)
+#define SNB_LLC_IFETCH         (1ULL << 9)
+#define SNB_BUS_LOCKS          (1ULL << 10)
+#define SNB_STRM_ST            (1ULL << 11)
+#define SNB_OTHER              (1ULL << 15)
+#define SNB_RESP_ANY           (1ULL << 16)
+#define SNB_NO_SUPP            (1ULL << 17)
+#define SNB_LLC_HITM           (1ULL << 18)
+#define SNB_LLC_HITE           (1ULL << 19)
+#define SNB_LLC_HITS           (1ULL << 20)
+#define SNB_LLC_HITF           (1ULL << 21)
+#define SNB_LOCAL              (1ULL << 22)
+#define SNB_REMOTE             (0xffULL << 23)
+#define SNB_SNP_NONE           (1ULL << 31)
+#define SNB_SNP_NOT_NEEDED     (1ULL << 32)
+#define SNB_SNP_MISS           (1ULL << 33)
+#define SNB_NO_FWD             (1ULL << 34)
+#define SNB_SNP_FWD            (1ULL << 35)
+#define SNB_HITM               (1ULL << 36)
+#define SNB_NON_DRAM           (1ULL << 37)
+
+#define SNB_DMND_READ          (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD)
+#define SNB_DMND_WRITE         (SNB_DMND_RFO|SNB_LLC_RFO)
+#define SNB_DMND_PREFETCH      (SNB_PF_DATA_RD|SNB_PF_RFO)
+
+#define SNB_SNP_ANY            (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
+                                SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
+                                SNB_HITM)
+
+#define SNB_DRAM_ANY           (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
+#define SNB_DRAM_REMOTE                (SNB_REMOTE|SNB_SNP_ANY)
+
+#define SNB_L3_ACCESS          SNB_RESP_ANY
+#define SNB_L3_MISS            (SNB_DRAM_ANY|SNB_NON_DRAM)
+
+static __initconst const u64 snb_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = SNB_DMND_READ|SNB_L3_MISS,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = SNB_DMND_WRITE|SNB_L3_MISS,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = SNB_DMND_PREFETCH|SNB_L3_MISS,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY,
+               [ C(RESULT_MISS)   ] = SNB_DMND_READ|SNB_DRAM_REMOTE,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY,
+               [ C(RESULT_MISS)   ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY,
+               [ C(RESULT_MISS)   ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE,
+       },
+ },
+};
+
+static __initconst const u64 snb_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS        */
+               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPLACEMENT              */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES       */
+               [ C(RESULT_MISS)   ] = 0x0851, /* L1D.ALL_M_REPLACEMENT        */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x024e, /* HW_PRE_REQ.DL1_MISS          */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0280, /* ICACHE.MISSES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_WRITE) ] = {
+               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT         */
+               [ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+
+};
+
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ *   reliably counted.
+ */
+
+#define HSW_DEMAND_DATA_RD             BIT_ULL(0)
+#define HSW_DEMAND_RFO                 BIT_ULL(1)
+#define HSW_ANY_RESPONSE               BIT_ULL(16)
+#define HSW_SUPPLIER_NONE              BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM         BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0                BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1                BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P       BIT_ULL(29)
+#define HSW_L3_MISS                    (HSW_L3_MISS_LOCAL_DRAM| \
+                                        HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+                                        HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE                 BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED           BIT_ULL(32)
+#define HSW_SNOOP_MISS                 BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD           BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD         BIT_ULL(35)
+#define HSW_SNOOP_HITM                 BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM             BIT_ULL(37)
+#define HSW_ANY_SNOOP                  (HSW_SNOOP_NONE| \
+                                        HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+                                        HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+                                        HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_SNOOP_DRAM                 (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ                        HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE               HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE             (HSW_L3_MISS_REMOTE_HOP0|\
+                                        HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_LLC_ACCESS                 HSW_ANY_RESPONSE
+
+#define BDW_L3_MISS_LOCAL              BIT(26)
+#define BDW_L3_MISS                    (BDW_L3_MISS_LOCAL| \
+                                        HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+                                        HSW_L3_MISS_REMOTE_HOP2P)
+
+
+static __initconst const u64 hsw_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_UOPS_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x151,   /* L1D.REPLACEMENT */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_UOPS_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x280,   /* ICACHE.MISSES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_UOPS_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x108,   /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_UOPS_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x149,   /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x6085,  /* ITLB_MISSES.STLB_HIT */
+               [ C(RESULT_MISS)   ] = 0x185,   /* ITLB_MISSES.MISS_CAUSES_A_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0xc4,    /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0xc5,    /* BR_MISP_RETIRED.ALL_BRANCHES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+                                      HSW_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+                                      HSW_L3_MISS|HSW_ANY_SNOOP,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+                                      HSW_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+                                      HSW_L3_MISS|HSW_ANY_SNOOP,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+                                      HSW_L3_MISS_LOCAL_DRAM|
+                                      HSW_SNOOP_DRAM,
+               [ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+                                      HSW_L3_MISS_REMOTE|
+                                      HSW_SNOOP_DRAM,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+                                      HSW_L3_MISS_LOCAL_DRAM|
+                                      HSW_SNOOP_DRAM,
+               [ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+                                      HSW_L3_MISS_REMOTE|
+                                      HSW_SNOOP_DRAM,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+};
+
+static __initconst const u64 westmere_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
+               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPL                     */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES      */
+               [ C(RESULT_MISS)   ] = 0x0251, /* L1D.M_REPL                   */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
+               [ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
+               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       /*
+        * Use RFO, not WRITEBACK, because a write miss would typically occur
+        * on RFO.
+        */
+       [ C(OP_WRITE) ] = {
+               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
+               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES      */
+               [ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
+               [ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISSES.ANY              */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+};
+
+/*
+ * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
+ * See IA32 SDM Vol 3B 30.6.1.3
+ */
+
+#define NHM_DMND_DATA_RD       (1 << 0)
+#define NHM_DMND_RFO           (1 << 1)
+#define NHM_DMND_IFETCH                (1 << 2)
+#define NHM_DMND_WB            (1 << 3)
+#define NHM_PF_DATA_RD         (1 << 4)
+#define NHM_PF_DATA_RFO                (1 << 5)
+#define NHM_PF_IFETCH          (1 << 6)
+#define NHM_OFFCORE_OTHER      (1 << 7)
+#define NHM_UNCORE_HIT         (1 << 8)
+#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
+#define NHM_OTHER_CORE_HITM    (1 << 10)
+                               /* reserved */
+#define NHM_REMOTE_CACHE_FWD   (1 << 12)
+#define NHM_REMOTE_DRAM                (1 << 13)
+#define NHM_LOCAL_DRAM         (1 << 14)
+#define NHM_NON_DRAM           (1 << 15)
+
+#define NHM_LOCAL              (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_REMOTE             (NHM_REMOTE_DRAM)
+
+#define NHM_DMND_READ          (NHM_DMND_DATA_RD)
+#define NHM_DMND_WRITE         (NHM_DMND_RFO|NHM_DMND_WB)
+#define NHM_DMND_PREFETCH      (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
+
+#define NHM_L3_HIT     (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
+#define NHM_L3_MISS    (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_L3_ACCESS  (NHM_L3_HIT|NHM_L3_MISS)
+
+static __initconst const u64 nehalem_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_L3_MISS,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_L3_MISS,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
+               [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
+               [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
+               [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE,
+       },
+ },
+};
+
+static __initconst const u64 nehalem_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
+               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPL                     */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES      */
+               [ C(RESULT_MISS)   ] = 0x0251, /* L1D.M_REPL                   */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
+               [ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
+               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       /*
+        * Use RFO, not WRITEBACK, because a write miss would typically occur
+        * on RFO.
+        */
+       [ C(OP_WRITE) ] = {
+               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
+               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
+               [ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
+               [ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+};
+
+static __initconst const u64 core2_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
+               [ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
+               [ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
+               [ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+               [ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+static __initconst const u64 atom_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */
+               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+               [ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+static struct extra_reg intel_slm_extra_regs[] __read_mostly =
+{
+       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),
+       EVENT_EXTRA_END
+};
+
+#define SLM_DMND_READ          SNB_DMND_DATA_RD
+#define SLM_DMND_WRITE         SNB_DMND_RFO
+#define SLM_DMND_PREFETCH      (SNB_PF_DATA_RD|SNB_PF_RFO)
+
+#define SLM_SNP_ANY            (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM)
+#define SLM_LLC_ACCESS         SNB_RESP_ANY
+#define SLM_LLC_MISS           (SLM_SNP_ANY|SNB_NON_DRAM)
+
+static __initconst const u64 slm_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = SLM_DMND_WRITE|SLM_LLC_MISS,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = SLM_DMND_PREFETCH|SLM_LLC_MISS,
+       },
+ },
+};
+
+static __initconst const u64 slm_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0x0104, /* LD_DCU_MISS */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */
+               [ C(RESULT_MISS)   ] = 0x0280, /* ICACGE.MISSES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0x0804, /* LD_DTLB_MISS */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+               [ C(RESULT_MISS)   ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+#define KNL_OT_L2_HITE         BIT_ULL(19) /* Other Tile L2 Hit */
+#define KNL_OT_L2_HITF         BIT_ULL(20) /* Other Tile L2 Hit */
+#define KNL_MCDRAM_LOCAL       BIT_ULL(21)
+#define KNL_MCDRAM_FAR         BIT_ULL(22)
+#define KNL_DDR_LOCAL          BIT_ULL(23)
+#define KNL_DDR_FAR            BIT_ULL(24)
+#define KNL_DRAM_ANY           (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \
+                                   KNL_DDR_LOCAL | KNL_DDR_FAR)
+#define KNL_L2_READ            SLM_DMND_READ
+#define KNL_L2_WRITE           SLM_DMND_WRITE
+#define KNL_L2_PREFETCH                SLM_DMND_PREFETCH
+#define KNL_L2_ACCESS          SLM_LLC_ACCESS
+#define KNL_L2_MISS            (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \
+                                  KNL_DRAM_ANY | SNB_SNP_ANY | \
+                                                 SNB_NON_DRAM)
+
+static __initconst const u64 knl_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+       [C(LL)] = {
+               [C(OP_READ)] = {
+                       [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS,
+                       [C(RESULT_MISS)]   = 0,
+               },
+               [C(OP_WRITE)] = {
+                       [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS,
+                       [C(RESULT_MISS)]   = KNL_L2_WRITE | KNL_L2_MISS,
+               },
+               [C(OP_PREFETCH)] = {
+                       [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS,
+                       [C(RESULT_MISS)]   = KNL_L2_PREFETCH | KNL_L2_MISS,
+               },
+       },
+};
+
+/*
+ * Used from PMIs where the LBRs are already disabled.
+ *
+ * This function could be called consecutively. It is required to remain in
+ * disabled state if called consecutively.
+ *
+ * During consecutive calls, the same disable value will be written to related
+ * registers, so the PMU state remains unchanged. hw.state in
+ * intel_bts_disable_local will remain PERF_HES_STOPPED too in consecutive
+ * calls.
+ */
+static void __intel_pmu_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+               intel_pmu_disable_bts();
+       else
+               intel_bts_disable_local();
+
+       intel_pmu_pebs_disable_all();
+}
+
+static void intel_pmu_disable_all(void)
+{
+       __intel_pmu_disable_all();
+       intel_pmu_lbr_disable_all();
+}
+
+static void __intel_pmu_enable_all(int added, bool pmi)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       intel_pmu_pebs_enable_all();
+       intel_pmu_lbr_enable_all(pmi);
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
+                       x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
+
+       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+               struct perf_event *event =
+                       cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
+
+               if (WARN_ON_ONCE(!event))
+                       return;
+
+               intel_pmu_enable_bts(event->hw.config);
+       } else
+               intel_bts_enable_local();
+}
+
+static void intel_pmu_enable_all(int added)
+{
+       __intel_pmu_enable_all(added, false);
+}
+
+/*
+ * Workaround for:
+ *   Intel Errata AAK100 (model 26)
+ *   Intel Errata AAP53  (model 30)
+ *   Intel Errata BD53   (model 44)
+ *
+ * The official story:
+ *   These chips need to be 'reset' when adding counters by programming the
+ *   magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
+ *   in sequence on the same PMC or on different PMCs.
+ *
+ * In practise it appears some of these events do in fact count, and
+ * we need to programm all 4 events.
+ */
+static void intel_pmu_nhm_workaround(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       static const unsigned long nhm_magic[4] = {
+               0x4300B5,
+               0x4300D2,
+               0x4300B1,
+               0x4300B1
+       };
+       struct perf_event *event;
+       int i;
+
+       /*
+        * The Errata requires below steps:
+        * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
+        * 2) Configure 4 PERFEVTSELx with the magic events and clear
+        *    the corresponding PMCx;
+        * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
+        * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
+        * 5) Clear 4 pairs of ERFEVTSELx and PMCx;
+        */
+
+       /*
+        * The real steps we choose are a little different from above.
+        * A) To reduce MSR operations, we don't run step 1) as they
+        *    are already cleared before this function is called;
+        * B) Call x86_perf_event_update to save PMCx before configuring
+        *    PERFEVTSELx with magic number;
+        * C) With step 5), we do clear only when the PERFEVTSELx is
+        *    not used currently.
+        * D) Call x86_perf_event_set_period to restore PMCx;
+        */
+
+       /* We always operate 4 pairs of PERF Counters */
+       for (i = 0; i < 4; i++) {
+               event = cpuc->events[i];
+               if (event)
+                       x86_perf_event_update(event);
+       }
+
+       for (i = 0; i < 4; i++) {
+               wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
+               wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
+       }
+
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
+
+       for (i = 0; i < 4; i++) {
+               event = cpuc->events[i];
+
+               if (event) {
+                       x86_perf_event_set_period(event);
+                       __x86_pmu_enable_event(&event->hw,
+                                       ARCH_PERFMON_EVENTSEL_ENABLE);
+               } else
+                       wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
+       }
+}
+
+static void intel_pmu_nhm_enable_all(int added)
+{
+       if (added)
+               intel_pmu_nhm_workaround();
+       intel_pmu_enable_all(added);
+}
+
+static inline u64 intel_pmu_get_status(void)
+{
+       u64 status;
+
+       rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+       return status;
+}
+
+static inline void intel_pmu_ack_status(u64 ack)
+{
+       wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
+{
+       int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+       u64 ctrl_val, mask;
+
+       mask = 0xfULL << (idx * 4);
+
+       rdmsrl(hwc->config_base, ctrl_val);
+       ctrl_val &= ~mask;
+       wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static inline bool event_is_checkpointed(struct perf_event *event)
+{
+       return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
+}
+
+static void intel_pmu_disable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
+               intel_pmu_disable_bts();
+               intel_pmu_drain_bts_buffer();
+               return;
+       }
+
+       cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
+       cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
+       cpuc->intel_cp_status &= ~(1ull << hwc->idx);
+
+       /*
+        * must disable before any actual event
+        * because any event may be combined with LBR
+        */
+       if (needs_branch_stack(event))
+               intel_pmu_lbr_disable(event);
+
+       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+               intel_pmu_disable_fixed(hwc);
+               return;
+       }
+
+       x86_pmu_disable_event(event);
+
+       if (unlikely(event->attr.precise_ip))
+               intel_pmu_pebs_disable(event);
+}
+
+static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
+{
+       int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+       u64 ctrl_val, bits, mask;
+
+       /*
+        * Enable IRQ generation (0x8),
+        * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+        * if requested:
+        */
+       bits = 0x8ULL;
+       if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+               bits |= 0x2;
+       if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+               bits |= 0x1;
+
+       /*
+        * ANY bit is supported in v3 and up
+        */
+       if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
+               bits |= 0x4;
+
+       bits <<= (idx * 4);
+       mask = 0xfULL << (idx * 4);
+
+       rdmsrl(hwc->config_base, ctrl_val);
+       ctrl_val &= ~mask;
+       ctrl_val |= bits;
+       wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void intel_pmu_enable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
+               if (!__this_cpu_read(cpu_hw_events.enabled))
+                       return;
+
+               intel_pmu_enable_bts(hwc->config);
+               return;
+       }
+       /*
+        * must enabled before any actual event
+        * because any event may be combined with LBR
+        */
+       if (needs_branch_stack(event))
+               intel_pmu_lbr_enable(event);
+
+       if (event->attr.exclude_host)
+               cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
+       if (event->attr.exclude_guest)
+               cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
+
+       if (unlikely(event_is_checkpointed(event)))
+               cpuc->intel_cp_status |= (1ull << hwc->idx);
+
+       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+               intel_pmu_enable_fixed(hwc);
+               return;
+       }
+
+       if (unlikely(event->attr.precise_ip))
+               intel_pmu_pebs_enable(event);
+
+       __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+/*
+ * Save and restart an expired event. Called by NMI contexts,
+ * so it has to be careful about preempting normal event ops:
+ */
+int intel_pmu_save_and_restart(struct perf_event *event)
+{
+       x86_perf_event_update(event);
+       /*
+        * For a checkpointed counter always reset back to 0.  This
+        * avoids a situation where the counter overflows, aborts the
+        * transaction and is then set back to shortly before the
+        * overflow, and overflows and aborts again.
+        */
+       if (unlikely(event_is_checkpointed(event))) {
+               /* No race with NMIs because the counter should not be armed */
+               wrmsrl(event->hw.event_base, 0);
+               local64_set(&event->hw.prev_count, 0);
+       }
+       return x86_perf_event_set_period(event);
+}
+
+static void intel_pmu_reset(void)
+{
+       struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+       unsigned long flags;
+       int idx;
+
+       if (!x86_pmu.num_counters)
+               return;
+
+       local_irq_save(flags);
+
+       pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
+               wrmsrl_safe(x86_pmu_event_addr(idx),  0ull);
+       }
+       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
+               wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+
+       if (ds)
+               ds->bts_index = ds->bts_buffer_base;
+
+       /* Ack all overflows and disable fixed counters */
+       if (x86_pmu.version >= 2) {
+               intel_pmu_ack_status(intel_pmu_get_status());
+               wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+       }
+
+       /* Reset LBRs and LBR freezing */
+       if (x86_pmu.lbr_nr) {
+               update_debugctlmsr(get_debugctlmsr() &
+                       ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
+       }
+
+       local_irq_restore(flags);
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_events *cpuc;
+       int bit, loops;
+       u64 status;
+       int handled;
+
+       cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       /*
+        * No known reason to not always do late ACK,
+        * but just in case do it opt-in.
+        */
+       if (!x86_pmu.late_ack)
+               apic_write(APIC_LVTPC, APIC_DM_NMI);
+       __intel_pmu_disable_all();
+       handled = intel_pmu_drain_bts_buffer();
+       handled += intel_bts_interrupt();
+       status = intel_pmu_get_status();
+       if (!status)
+               goto done;
+
+       loops = 0;
+again:
+       intel_pmu_lbr_read();
+       intel_pmu_ack_status(status);
+       if (++loops > 100) {
+               static bool warned = false;
+               if (!warned) {
+                       WARN(1, "perfevents: irq loop stuck!\n");
+                       perf_event_print_debug();
+                       warned = true;
+               }
+               intel_pmu_reset();
+               goto done;
+       }
+
+       inc_irq_stat(apic_perf_irqs);
+
+
+       /*
+        * Ignore a range of extra bits in status that do not indicate
+        * overflow by themselves.
+        */
+       status &= ~(GLOBAL_STATUS_COND_CHG |
+                   GLOBAL_STATUS_ASIF |
+                   GLOBAL_STATUS_LBRS_FROZEN);
+       if (!status)
+               goto done;
+
+       /*
+        * PEBS overflow sets bit 62 in the global status register
+        */
+       if (__test_and_clear_bit(62, (unsigned long *)&status)) {
+               handled++;
+               x86_pmu.drain_pebs(regs);
+               /*
+                * There are cases where, even though, the PEBS ovfl bit is set
+                * in GLOBAL_OVF_STATUS, the PEBS events may also have their
+                * overflow bits set for their counters. We must clear them
+                * here because they have been processed as exact samples in
+                * the drain_pebs() routine. They must not be processed again
+                * in the for_each_bit_set() loop for regular samples below.
+                */
+               status &= ~cpuc->pebs_enabled;
+               status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
+       }
+
+       /*
+        * Intel PT
+        */
+       if (__test_and_clear_bit(55, (unsigned long *)&status)) {
+               handled++;
+               intel_pt_interrupt();
+       }
+
+       /*
+        * Checkpointed counters can lead to 'spurious' PMIs because the
+        * rollback caused by the PMI will have cleared the overflow status
+        * bit. Therefore always force probe these counters.
+        */
+       status |= cpuc->intel_cp_status;
+
+       for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+               struct perf_event *event = cpuc->events[bit];
+
+               handled++;
+
+               if (!test_bit(bit, cpuc->active_mask))
+                       continue;
+
+               if (!intel_pmu_save_and_restart(event))
+                       continue;
+
+               perf_sample_data_init(&data, 0, event->hw.last_period);
+
+               if (has_branch_stack(event))
+                       data.br_stack = &cpuc->lbr_stack;
+
+               if (perf_event_overflow(event, &data, regs))
+                       x86_pmu_stop(event, 0);
+       }
+
+       /*
+        * Repeat if there is more work to be done:
+        */
+       status = intel_pmu_get_status();
+       if (status)
+               goto again;
+
+done:
+       /* Only restore PMU state when it's active. See x86_pmu_disable(). */
+       if (cpuc->enabled)
+               __intel_pmu_enable_all(0, true);
+
+       /*
+        * Only unmask the NMI after the overflow counters
+        * have been reset. This avoids spurious NMIs on
+        * Haswell CPUs.
+        */
+       if (x86_pmu.late_ack)
+               apic_write(APIC_LVTPC, APIC_DM_NMI);
+       return handled;
+}
+
+static struct event_constraint *
+intel_bts_constraints(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       unsigned int hw_event, bts_event;
+
+       if (event->attr.freq)
+               return NULL;
+
+       hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
+       bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+
+       if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
+               return &bts_constraint;
+
+       return NULL;
+}
+
+static int intel_alt_er(int idx, u64 config)
+{
+       int alt_idx = idx;
+
+       if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
+               return idx;
+
+       if (idx == EXTRA_REG_RSP_0)
+               alt_idx = EXTRA_REG_RSP_1;
+
+       if (idx == EXTRA_REG_RSP_1)
+               alt_idx = EXTRA_REG_RSP_0;
+
+       if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
+               return idx;
+
+       return alt_idx;
+}
+
+static void intel_fixup_er(struct perf_event *event, int idx)
+{
+       event->hw.extra_reg.idx = idx;
+
+       if (idx == EXTRA_REG_RSP_0) {
+               event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+               event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
+               event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
+       } else if (idx == EXTRA_REG_RSP_1) {
+               event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+               event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
+               event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
+       }
+}
+
+/*
+ * manage allocation of shared extra msr for certain events
+ *
+ * sharing can be:
+ * per-cpu: to be shared between the various events on a single PMU
+ * per-core: per-cpu + shared by HT threads
+ */
+static struct event_constraint *
+__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
+                                  struct perf_event *event,
+                                  struct hw_perf_event_extra *reg)
+{
+       struct event_constraint *c = &emptyconstraint;
+       struct er_account *era;
+       unsigned long flags;
+       int idx = reg->idx;
+
+       /*
+        * reg->alloc can be set due to existing state, so for fake cpuc we
+        * need to ignore this, otherwise we might fail to allocate proper fake
+        * state for this extra reg constraint. Also see the comment below.
+        */
+       if (reg->alloc && !cpuc->is_fake)
+               return NULL; /* call x86_get_event_constraint() */
+
+again:
+       era = &cpuc->shared_regs->regs[idx];
+       /*
+        * we use spin_lock_irqsave() to avoid lockdep issues when
+        * passing a fake cpuc
+        */
+       raw_spin_lock_irqsave(&era->lock, flags);
+
+       if (!atomic_read(&era->ref) || era->config == reg->config) {
+
+               /*
+                * If its a fake cpuc -- as per validate_{group,event}() we
+                * shouldn't touch event state and we can avoid doing so
+                * since both will only call get_event_constraints() once
+                * on each event, this avoids the need for reg->alloc.
+                *
+                * Not doing the ER fixup will only result in era->reg being
+                * wrong, but since we won't actually try and program hardware
+                * this isn't a problem either.
+                */
+               if (!cpuc->is_fake) {
+                       if (idx != reg->idx)
+                               intel_fixup_er(event, idx);
+
+                       /*
+                        * x86_schedule_events() can call get_event_constraints()
+                        * multiple times on events in the case of incremental
+                        * scheduling(). reg->alloc ensures we only do the ER
+                        * allocation once.
+                        */
+                       reg->alloc = 1;
+               }
+
+               /* lock in msr value */
+               era->config = reg->config;
+               era->reg = reg->reg;
+
+               /* one more user */
+               atomic_inc(&era->ref);
+
+               /*
+                * need to call x86_get_event_constraint()
+                * to check if associated event has constraints
+                */
+               c = NULL;
+       } else {
+               idx = intel_alt_er(idx, reg->config);
+               if (idx != reg->idx) {
+                       raw_spin_unlock_irqrestore(&era->lock, flags);
+                       goto again;
+               }
+       }
+       raw_spin_unlock_irqrestore(&era->lock, flags);
+
+       return c;
+}
+
+static void
+__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
+                                  struct hw_perf_event_extra *reg)
+{
+       struct er_account *era;
+
+       /*
+        * Only put constraint if extra reg was actually allocated. Also takes
+        * care of event which do not use an extra shared reg.
+        *
+        * Also, if this is a fake cpuc we shouldn't touch any event state
+        * (reg->alloc) and we don't care about leaving inconsistent cpuc state
+        * either since it'll be thrown out.
+        */
+       if (!reg->alloc || cpuc->is_fake)
+               return;
+
+       era = &cpuc->shared_regs->regs[reg->idx];
+
+       /* one fewer user */
+       atomic_dec(&era->ref);
+
+       /* allocate again next time */
+       reg->alloc = 0;
+}
+
+static struct event_constraint *
+intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
+                             struct perf_event *event)
+{
+       struct event_constraint *c = NULL, *d;
+       struct hw_perf_event_extra *xreg, *breg;
+
+       xreg = &event->hw.extra_reg;
+       if (xreg->idx != EXTRA_REG_NONE) {
+               c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
+               if (c == &emptyconstraint)
+                       return c;
+       }
+       breg = &event->hw.branch_reg;
+       if (breg->idx != EXTRA_REG_NONE) {
+               d = __intel_shared_reg_get_constraints(cpuc, event, breg);
+               if (d == &emptyconstraint) {
+                       __intel_shared_reg_put_constraints(cpuc, xreg);
+                       c = d;
+               }
+       }
+       return c;
+}
+
+struct event_constraint *
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                         struct perf_event *event)
+{
+       struct event_constraint *c;
+
+       if (x86_pmu.event_constraints) {
+               for_each_event_constraint(c, x86_pmu.event_constraints) {
+                       if ((event->hw.config & c->cmask) == c->code) {
+                               event->hw.flags |= c->flags;
+                               return c;
+                       }
+               }
+       }
+
+       return &unconstrained;
+}
+
+static struct event_constraint *
+__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                           struct perf_event *event)
+{
+       struct event_constraint *c;
+
+       c = intel_bts_constraints(event);
+       if (c)
+               return c;
+
+       c = intel_shared_regs_constraints(cpuc, event);
+       if (c)
+               return c;
+
+       c = intel_pebs_constraints(event);
+       if (c)
+               return c;
+
+       return x86_get_event_constraints(cpuc, idx, event);
+}
+
+static void
+intel_start_scheduling(struct cpu_hw_events *cpuc)
+{
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct intel_excl_states *xl;
+       int tid = cpuc->excl_thread_id;
+
+       /*
+        * nothing needed if in group validation mode
+        */
+       if (cpuc->is_fake || !is_ht_workaround_enabled())
+               return;
+
+       /*
+        * no exclusion needed
+        */
+       if (WARN_ON_ONCE(!excl_cntrs))
+               return;
+
+       xl = &excl_cntrs->states[tid];
+
+       xl->sched_started = true;
+       /*
+        * lock shared state until we are done scheduling
+        * in stop_event_scheduling()
+        * makes scheduling appear as a transaction
+        */
+       raw_spin_lock(&excl_cntrs->lock);
+}
+
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
+{
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct event_constraint *c = cpuc->event_constraint[idx];
+       struct intel_excl_states *xl;
+       int tid = cpuc->excl_thread_id;
+
+       if (cpuc->is_fake || !is_ht_workaround_enabled())
+               return;
+
+       if (WARN_ON_ONCE(!excl_cntrs))
+               return;
+
+       if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+               return;
+
+       xl = &excl_cntrs->states[tid];
+
+       lockdep_assert_held(&excl_cntrs->lock);
+
+       if (c->flags & PERF_X86_EVENT_EXCL)
+               xl->state[cntr] = INTEL_EXCL_EXCLUSIVE;
+       else
+               xl->state[cntr] = INTEL_EXCL_SHARED;
+}
+
+static void
+intel_stop_scheduling(struct cpu_hw_events *cpuc)
+{
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct intel_excl_states *xl;
+       int tid = cpuc->excl_thread_id;
+
+       /*
+        * nothing needed if in group validation mode
+        */
+       if (cpuc->is_fake || !is_ht_workaround_enabled())
+               return;
+       /*
+        * no exclusion needed
+        */
+       if (WARN_ON_ONCE(!excl_cntrs))
+               return;
+
+       xl = &excl_cntrs->states[tid];
+
+       xl->sched_started = false;
+       /*
+        * release shared state lock (acquired in intel_start_scheduling())
+        */
+       raw_spin_unlock(&excl_cntrs->lock);
+}
+
+static struct event_constraint *
+intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+                          int idx, struct event_constraint *c)
+{
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct intel_excl_states *xlo;
+       int tid = cpuc->excl_thread_id;
+       int is_excl, i;
+
+       /*
+        * validating a group does not require
+        * enforcing cross-thread  exclusion
+        */
+       if (cpuc->is_fake || !is_ht_workaround_enabled())
+               return c;
+
+       /*
+        * no exclusion needed
+        */
+       if (WARN_ON_ONCE(!excl_cntrs))
+               return c;
+
+       /*
+        * because we modify the constraint, we need
+        * to make a copy. Static constraints come
+        * from static const tables.
+        *
+        * only needed when constraint has not yet
+        * been cloned (marked dynamic)
+        */
+       if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
+               struct event_constraint *cx;
+
+               /*
+                * grab pre-allocated constraint entry
+                */
+               cx = &cpuc->constraint_list[idx];
+
+               /*
+                * initialize dynamic constraint
+                * with static constraint
+                */
+               *cx = *c;
+
+               /*
+                * mark constraint as dynamic, so we
+                * can free it later on
+                */
+               cx->flags |= PERF_X86_EVENT_DYNAMIC;
+               c = cx;
+       }
+
+       /*
+        * From here on, the constraint is dynamic.
+        * Either it was just allocated above, or it
+        * was allocated during a earlier invocation
+        * of this function
+        */
+
+       /*
+        * state of sibling HT
+        */
+       xlo = &excl_cntrs->states[tid ^ 1];
+
+       /*
+        * event requires exclusive counter access
+        * across HT threads
+        */
+       is_excl = c->flags & PERF_X86_EVENT_EXCL;
+       if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
+               event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
+               if (!cpuc->n_excl++)
+                       WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
+       }
+
+       /*
+        * Modify static constraint with current dynamic
+        * state of thread
+        *
+        * EXCLUSIVE: sibling counter measuring exclusive event
+        * SHARED   : sibling counter measuring non-exclusive event
+        * UNUSED   : sibling counter unused
+        */
+       for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
+               /*
+                * exclusive event in sibling counter
+                * our corresponding counter cannot be used
+                * regardless of our event
+                */
+               if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE)
+                       __clear_bit(i, c->idxmsk);
+               /*
+                * if measuring an exclusive event, sibling
+                * measuring non-exclusive, then counter cannot
+                * be used
+                */
+               if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED)
+                       __clear_bit(i, c->idxmsk);
+       }
+
+       /*
+        * recompute actual bit weight for scheduling algorithm
+        */
+       c->weight = hweight64(c->idxmsk64);
+
+       /*
+        * if we return an empty mask, then switch
+        * back to static empty constraint to avoid
+        * the cost of freeing later on
+        */
+       if (c->weight == 0)
+               c = &emptyconstraint;
+
+       return c;
+}
+
+static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                           struct perf_event *event)
+{
+       struct event_constraint *c1 = NULL;
+       struct event_constraint *c2;
+
+       if (idx >= 0) /* fake does < 0 */
+               c1 = cpuc->event_constraint[idx];
+
+       /*
+        * first time only
+        * - static constraint: no change across incremental scheduling calls
+        * - dynamic constraint: handled by intel_get_excl_constraints()
+        */
+       c2 = __intel_get_event_constraints(cpuc, idx, event);
+       if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
+               bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
+               c1->weight = c2->weight;
+               c2 = c1;
+       }
+
+       if (cpuc->excl_cntrs)
+               return intel_get_excl_constraints(cpuc, event, idx, c2);
+
+       return c2;
+}
+
+static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
+               struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       int tid = cpuc->excl_thread_id;
+       struct intel_excl_states *xl;
+
+       /*
+        * nothing needed if in group validation mode
+        */
+       if (cpuc->is_fake)
+               return;
+
+       if (WARN_ON_ONCE(!excl_cntrs))
+               return;
+
+       if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
+               hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
+               if (!--cpuc->n_excl)
+                       WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
+       }
+
+       /*
+        * If event was actually assigned, then mark the counter state as
+        * unused now.
+        */
+       if (hwc->idx >= 0) {
+               xl = &excl_cntrs->states[tid];
+
+               /*
+                * put_constraint may be called from x86_schedule_events()
+                * which already has the lock held so here make locking
+                * conditional.
+                */
+               if (!xl->sched_started)
+                       raw_spin_lock(&excl_cntrs->lock);
+
+               xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+               if (!xl->sched_started)
+                       raw_spin_unlock(&excl_cntrs->lock);
+       }
+}
+
+static void
+intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
+                                       struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg;
+
+       reg = &event->hw.extra_reg;
+       if (reg->idx != EXTRA_REG_NONE)
+               __intel_shared_reg_put_constraints(cpuc, reg);
+
+       reg = &event->hw.branch_reg;
+       if (reg->idx != EXTRA_REG_NONE)
+               __intel_shared_reg_put_constraints(cpuc, reg);
+}
+
+static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+                                       struct perf_event *event)
+{
+       intel_put_shared_regs_event_constraints(cpuc, event);
+
+       /*
+        * is PMU has exclusive counter restrictions, then
+        * all events are subject to and must call the
+        * put_excl_constraints() routine
+        */
+       if (cpuc->excl_cntrs)
+               intel_put_excl_constraints(cpuc, event);
+}
+
+static void intel_pebs_aliases_core2(struct perf_event *event)
+{
+       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+               /*
+                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+                * (0x003c) so that we can use it with PEBS.
+                *
+                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+                * PEBS capable. However we can use INST_RETIRED.ANY_P
+                * (0x00c0), which is a PEBS capable event, to get the same
+                * count.
+                *
+                * INST_RETIRED.ANY_P counts the number of cycles that retires
+                * CNTMASK instructions. By setting CNTMASK to a value (16)
+                * larger than the maximum number of instructions that can be
+                * retired per cycle (4) and then inverting the condition, we
+                * count all cycles that retire 16 or less instructions, which
+                * is every cycle.
+                *
+                * Thereby we gain a PEBS capable cycle counter.
+                */
+               u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
+
+               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+               event->hw.config = alt_config;
+       }
+}
+
+static void intel_pebs_aliases_snb(struct perf_event *event)
+{
+       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+               /*
+                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+                * (0x003c) so that we can use it with PEBS.
+                *
+                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+                * PEBS capable. However we can use UOPS_RETIRED.ALL
+                * (0x01c2), which is a PEBS capable event, to get the same
+                * count.
+                *
+                * UOPS_RETIRED.ALL counts the number of cycles that retires
+                * CNTMASK micro-ops. By setting CNTMASK to a value (16)
+                * larger than the maximum number of micro-ops that can be
+                * retired per cycle (4) and then inverting the condition, we
+                * count all cycles that retire 16 or less micro-ops, which
+                * is every cycle.
+                *
+                * Thereby we gain a PEBS capable cycle counter.
+                */
+               u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
+
+               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+               event->hw.config = alt_config;
+       }
+}
+
+static void intel_pebs_aliases_precdist(struct perf_event *event)
+{
+       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+               /*
+                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+                * (0x003c) so that we can use it with PEBS.
+                *
+                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+                * PEBS capable. However we can use INST_RETIRED.PREC_DIST
+                * (0x01c0), which is a PEBS capable event, to get the same
+                * count.
+                *
+                * The PREC_DIST event has special support to minimize sample
+                * shadowing effects. One drawback is that it can be
+                * only programmed on counter 1, but that seems like an
+                * acceptable trade off.
+                */
+               u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
+
+               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+               event->hw.config = alt_config;
+       }
+}
+
+static void intel_pebs_aliases_ivb(struct perf_event *event)
+{
+       if (event->attr.precise_ip < 3)
+               return intel_pebs_aliases_snb(event);
+       return intel_pebs_aliases_precdist(event);
+}
+
+static void intel_pebs_aliases_skl(struct perf_event *event)
+{
+       if (event->attr.precise_ip < 3)
+               return intel_pebs_aliases_core2(event);
+       return intel_pebs_aliases_precdist(event);
+}
+
+static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
+{
+       unsigned long flags = x86_pmu.free_running_flags;
+
+       if (event->attr.use_clockid)
+               flags &= ~PERF_SAMPLE_TIME;
+       return flags;
+}
+
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+       int ret = x86_pmu_hw_config(event);
+
+       if (ret)
+               return ret;
+
+       if (event->attr.precise_ip) {
+               if (!event->attr.freq) {
+                       event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
+                       if (!(event->attr.sample_type &
+                             ~intel_pmu_free_running_flags(event)))
+                               event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
+               }
+               if (x86_pmu.pebs_aliases)
+                       x86_pmu.pebs_aliases(event);
+       }
+
+       if (needs_branch_stack(event)) {
+               ret = intel_pmu_setup_lbr_filter(event);
+               if (ret)
+                       return ret;
+
+               /*
+                * BTS is set up earlier in this path, so don't account twice
+                */
+               if (!intel_pmu_has_bts(event)) {
+                       /* disallow lbr if conflicting events are present */
+                       if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+                               return -EBUSY;
+
+                       event->destroy = hw_perf_lbr_event_destroy;
+               }
+       }
+
+       if (event->attr.type != PERF_TYPE_RAW)
+               return 0;
+
+       if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
+               return 0;
+
+       if (x86_pmu.version < 3)
+               return -EINVAL;
+
+       if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+               return -EACCES;
+
+       event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
+
+       return 0;
+}
+
+struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
+{
+       if (x86_pmu.guest_get_msrs)
+               return x86_pmu.guest_get_msrs(nr);
+       *nr = 0;
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
+
+static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
+
+       arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
+       arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
+       arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
+       /*
+        * If PMU counter has PEBS enabled it is not enough to disable counter
+        * on a guest entry since PEBS memory write can overshoot guest entry
+        * and corrupt guest memory. Disabling PEBS solves the problem.
+        */
+       arr[1].msr = MSR_IA32_PEBS_ENABLE;
+       arr[1].host = cpuc->pebs_enabled;
+       arr[1].guest = 0;
+
+       *nr = 2;
+       return arr;
+}
+
+static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++)  {
+               struct perf_event *event = cpuc->events[idx];
+
+               arr[idx].msr = x86_pmu_config_addr(idx);
+               arr[idx].host = arr[idx].guest = 0;
+
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+
+               arr[idx].host = arr[idx].guest =
+                       event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE;
+
+               if (event->attr.exclude_host)
+                       arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+               else if (event->attr.exclude_guest)
+                       arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+       }
+
+       *nr = x86_pmu.num_counters;
+       return arr;
+}
+
+static void core_pmu_enable_event(struct perf_event *event)
+{
+       if (!event->attr.exclude_host)
+               x86_pmu_enable_event(event);
+}
+
+static void core_pmu_enable_all(int added)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
+
+               if (!test_bit(idx, cpuc->active_mask) ||
+                               cpuc->events[idx]->attr.exclude_host)
+                       continue;
+
+               __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+       }
+}
+
+static int hsw_hw_config(struct perf_event *event)
+{
+       int ret = intel_pmu_hw_config(event);
+
+       if (ret)
+               return ret;
+       if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE))
+               return 0;
+       event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
+
+       /*
+        * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with
+        * PEBS or in ANY thread mode. Since the results are non-sensical forbid
+        * this combination.
+        */
+       if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) &&
+            ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) ||
+             event->attr.precise_ip > 0))
+               return -EOPNOTSUPP;
+
+       if (event_is_checkpointed(event)) {
+               /*
+                * Sampling of checkpointed events can cause situations where
+                * the CPU constantly aborts because of a overflow, which is
+                * then checkpointed back and ignored. Forbid checkpointing
+                * for sampling.
+                *
+                * But still allow a long sampling period, so that perf stat
+                * from KVM works.
+                */
+               if (event->attr.sample_period > 0 &&
+                   event->attr.sample_period < 0x7fffffff)
+                       return -EOPNOTSUPP;
+       }
+       return 0;
+}
+
+static struct event_constraint counter2_constraint =
+                       EVENT_CONSTRAINT(0, 0x4, 0);
+
+static struct event_constraint *
+hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                         struct perf_event *event)
+{
+       struct event_constraint *c;
+
+       c = intel_get_event_constraints(cpuc, idx, event);
+
+       /* Handle special quirk on in_tx_checkpointed only in counter 2 */
+       if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
+               if (c->idxmsk64 & (1U << 2))
+                       return &counter2_constraint;
+               return &emptyconstraint;
+       }
+
+       return c;
+}
+
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+       if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+                       X86_CONFIG(.event=0xc0, .umask=0x01)) {
+               if (left < 128)
+                       left = 128;
+               left &= ~0x3fu;
+       }
+       return left;
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7"    );
+PMU_FORMAT_ATTR(umask, "config:8-15"   );
+PMU_FORMAT_ATTR(edge,  "config:18"     );
+PMU_FORMAT_ATTR(pc,    "config:19"     );
+PMU_FORMAT_ATTR(any,   "config:21"     ); /* v3 + */
+PMU_FORMAT_ATTR(inv,   "config:23"     );
+PMU_FORMAT_ATTR(cmask, "config:24-31"  );
+PMU_FORMAT_ATTR(in_tx,  "config:32");
+PMU_FORMAT_ATTR(in_tx_cp, "config:33");
+
+static struct attribute *intel_arch_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_pc.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask.attr,
+       NULL,
+};
+
+ssize_t intel_event_sysfs_show(char *page, u64 config)
+{
+       u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
+
+       return x86_event_sysfs_show(page, config, event);
+}
+
+struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+       struct intel_shared_regs *regs;
+       int i;
+
+       regs = kzalloc_node(sizeof(struct intel_shared_regs),
+                           GFP_KERNEL, cpu_to_node(cpu));
+       if (regs) {
+               /*
+                * initialize the locks to keep lockdep happy
+                */
+               for (i = 0; i < EXTRA_REG_MAX; i++)
+                       raw_spin_lock_init(&regs->regs[i].lock);
+
+               regs->core_id = -1;
+       }
+       return regs;
+}
+
+static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
+{
+       struct intel_excl_cntrs *c;
+
+       c = kzalloc_node(sizeof(struct intel_excl_cntrs),
+                        GFP_KERNEL, cpu_to_node(cpu));
+       if (c) {
+               raw_spin_lock_init(&c->lock);
+               c->core_id = -1;
+       }
+       return c;
+}
+
+static int intel_pmu_cpu_prepare(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+       if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+               cpuc->shared_regs = allocate_shared_regs(cpu);
+               if (!cpuc->shared_regs)
+                       goto err;
+       }
+
+       if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+               size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
+
+               cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
+               if (!cpuc->constraint_list)
+                       goto err_shared_regs;
+
+               cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
+               if (!cpuc->excl_cntrs)
+                       goto err_constraint_list;
+
+               cpuc->excl_thread_id = 0;
+       }
+
+       return NOTIFY_OK;
+
+err_constraint_list:
+       kfree(cpuc->constraint_list);
+       cpuc->constraint_list = NULL;
+
+err_shared_regs:
+       kfree(cpuc->shared_regs);
+       cpuc->shared_regs = NULL;
+
+err:
+       return NOTIFY_BAD;
+}
+
+static void intel_pmu_cpu_starting(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+       int core_id = topology_core_id(cpu);
+       int i;
+
+       init_debug_store_on_cpu(cpu);
+       /*
+        * Deal with CPUs that don't clear their LBRs on power-up.
+        */
+       intel_pmu_lbr_reset();
+
+       cpuc->lbr_sel = NULL;
+
+       if (!cpuc->shared_regs)
+               return;
+
+       if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
+               for_each_cpu(i, topology_sibling_cpumask(cpu)) {
+                       struct intel_shared_regs *pc;
+
+                       pc = per_cpu(cpu_hw_events, i).shared_regs;
+                       if (pc && pc->core_id == core_id) {
+                               cpuc->kfree_on_online[0] = cpuc->shared_regs;
+                               cpuc->shared_regs = pc;
+                               break;
+                       }
+               }
+               cpuc->shared_regs->core_id = core_id;
+               cpuc->shared_regs->refcnt++;
+       }
+
+       if (x86_pmu.lbr_sel_map)
+               cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
+
+       if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+               for_each_cpu(i, topology_sibling_cpumask(cpu)) {
+                       struct intel_excl_cntrs *c;
+
+                       c = per_cpu(cpu_hw_events, i).excl_cntrs;
+                       if (c && c->core_id == core_id) {
+                               cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
+                               cpuc->excl_cntrs = c;
+                               cpuc->excl_thread_id = 1;
+                               break;
+                       }
+               }
+               cpuc->excl_cntrs->core_id = core_id;
+               cpuc->excl_cntrs->refcnt++;
+       }
+}
+
+static void free_excl_cntrs(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+       struct intel_excl_cntrs *c;
+
+       c = cpuc->excl_cntrs;
+       if (c) {
+               if (c->core_id == -1 || --c->refcnt == 0)
+                       kfree(c);
+               cpuc->excl_cntrs = NULL;
+               kfree(cpuc->constraint_list);
+               cpuc->constraint_list = NULL;
+       }
+}
+
+static void intel_pmu_cpu_dying(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+       struct intel_shared_regs *pc;
+
+       pc = cpuc->shared_regs;
+       if (pc) {
+               if (pc->core_id == -1 || --pc->refcnt == 0)
+                       kfree(pc);
+               cpuc->shared_regs = NULL;
+       }
+
+       free_excl_cntrs(cpu);
+
+       fini_debug_store_on_cpu(cpu);
+}
+
+static void intel_pmu_sched_task(struct perf_event_context *ctx,
+                                bool sched_in)
+{
+       if (x86_pmu.pebs_active)
+               intel_pmu_pebs_sched_task(ctx, sched_in);
+       if (x86_pmu.lbr_nr)
+               intel_pmu_lbr_sched_task(ctx, sched_in);
+}
+
+PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
+
+PMU_FORMAT_ATTR(ldlat, "config1:0-15");
+
+PMU_FORMAT_ATTR(frontend, "config1:0-23");
+
+static struct attribute *intel_arch3_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_pc.attr,
+       &format_attr_any.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask.attr,
+       &format_attr_in_tx.attr,
+       &format_attr_in_tx_cp.attr,
+
+       &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
+       &format_attr_ldlat.attr, /* PEBS load latency */
+       NULL,
+};
+
+static struct attribute *skl_format_attr[] = {
+       &format_attr_frontend.attr,
+       NULL,
+};
+
+static __initconst const struct x86_pmu core_pmu = {
+       .name                   = "core",
+       .handle_irq             = x86_pmu_handle_irq,
+       .disable_all            = x86_pmu_disable_all,
+       .enable_all             = core_pmu_enable_all,
+       .enable                 = core_pmu_enable_event,
+       .disable                = x86_pmu_disable_event,
+       .hw_config              = x86_pmu_hw_config,
+       .schedule_events        = x86_schedule_events,
+       .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
+       .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
+       .event_map              = intel_pmu_event_map,
+       .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
+       .apic                   = 1,
+       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
+
+       /*
+        * Intel PMCs cannot be accessed sanely above 32-bit width,
+        * so we install an artificial 1<<31 period regardless of
+        * the generic event period:
+        */
+       .max_period             = (1ULL<<31) - 1,
+       .get_event_constraints  = intel_get_event_constraints,
+       .put_event_constraints  = intel_put_event_constraints,
+       .event_constraints      = intel_core_event_constraints,
+       .guest_get_msrs         = core_guest_get_msrs,
+       .format_attrs           = intel_arch_formats_attr,
+       .events_sysfs_show      = intel_event_sysfs_show,
+
+       /*
+        * Virtual (or funny metal) CPU can define x86_pmu.extra_regs
+        * together with PMU version 1 and thus be using core_pmu with
+        * shared_regs. We need following callbacks here to allocate
+        * it properly.
+        */
+       .cpu_prepare            = intel_pmu_cpu_prepare,
+       .cpu_starting           = intel_pmu_cpu_starting,
+       .cpu_dying              = intel_pmu_cpu_dying,
+};
+
+static __initconst const struct x86_pmu intel_pmu = {
+       .name                   = "Intel",
+       .handle_irq             = intel_pmu_handle_irq,
+       .disable_all            = intel_pmu_disable_all,
+       .enable_all             = intel_pmu_enable_all,
+       .enable                 = intel_pmu_enable_event,
+       .disable                = intel_pmu_disable_event,
+       .hw_config              = intel_pmu_hw_config,
+       .schedule_events        = x86_schedule_events,
+       .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
+       .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
+       .event_map              = intel_pmu_event_map,
+       .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
+       .apic                   = 1,
+       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
+       /*
+        * Intel PMCs cannot be accessed sanely above 32 bit width,
+        * so we install an artificial 1<<31 period regardless of
+        * the generic event period:
+        */
+       .max_period             = (1ULL << 31) - 1,
+       .get_event_constraints  = intel_get_event_constraints,
+       .put_event_constraints  = intel_put_event_constraints,
+       .pebs_aliases           = intel_pebs_aliases_core2,
+
+       .format_attrs           = intel_arch3_formats_attr,
+       .events_sysfs_show      = intel_event_sysfs_show,
+
+       .cpu_prepare            = intel_pmu_cpu_prepare,
+       .cpu_starting           = intel_pmu_cpu_starting,
+       .cpu_dying              = intel_pmu_cpu_dying,
+       .guest_get_msrs         = intel_guest_get_msrs,
+       .sched_task             = intel_pmu_sched_task,
+};
+
+static __init void intel_clovertown_quirk(void)
+{
+       /*
+        * PEBS is unreliable due to:
+        *
+        *   AJ67  - PEBS may experience CPL leaks
+        *   AJ68  - PEBS PMI may be delayed by one event
+        *   AJ69  - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
+        *   AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
+        *
+        * AJ67 could be worked around by restricting the OS/USR flags.
+        * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
+        *
+        * AJ106 could possibly be worked around by not allowing LBR
+        *       usage from PEBS, including the fixup.
+        * AJ68  could possibly be worked around by always programming
+        *       a pebs_event_reset[0] value and coping with the lost events.
+        *
+        * But taken together it might just make sense to not enable PEBS on
+        * these chips.
+        */
+       pr_warn("PEBS disabled due to CPU errata\n");
+       x86_pmu.pebs = 0;
+       x86_pmu.pebs_constraints = NULL;
+}
+
+static int intel_snb_pebs_broken(int cpu)
+{
+       u32 rev = UINT_MAX; /* default to broken for unknown models */
+
+       switch (cpu_data(cpu).x86_model) {
+       case 42: /* SNB */
+               rev = 0x28;
+               break;
+
+       case 45: /* SNB-EP */
+               switch (cpu_data(cpu).x86_mask) {
+               case 6: rev = 0x618; break;
+               case 7: rev = 0x70c; break;
+               }
+       }
+
+       return (cpu_data(cpu).microcode < rev);
+}
+
+static void intel_snb_check_microcode(void)
+{
+       int pebs_broken = 0;
+       int cpu;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
+               if ((pebs_broken = intel_snb_pebs_broken(cpu)))
+                       break;
+       }
+       put_online_cpus();
+
+       if (pebs_broken == x86_pmu.pebs_broken)
+               return;
+
+       /*
+        * Serialized by the microcode lock..
+        */
+       if (x86_pmu.pebs_broken) {
+               pr_info("PEBS enabled due to microcode update\n");
+               x86_pmu.pebs_broken = 0;
+       } else {
+               pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
+               x86_pmu.pebs_broken = 1;
+       }
+}
+
+/*
+ * Under certain circumstances, access certain MSR may cause #GP.
+ * The function tests if the input MSR can be safely accessed.
+ */
+static bool check_msr(unsigned long msr, u64 mask)
+{
+       u64 val_old, val_new, val_tmp;
+
+       /*
+        * Read the current value, change it and read it back to see if it
+        * matches, this is needed to detect certain hardware emulators
+        * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+        */
+       if (rdmsrl_safe(msr, &val_old))
+               return false;
+
+       /*
+        * Only change the bits which can be updated by wrmsrl.
+        */
+       val_tmp = val_old ^ mask;
+       if (wrmsrl_safe(msr, val_tmp) ||
+           rdmsrl_safe(msr, &val_new))
+               return false;
+
+       if (val_new != val_tmp)
+               return false;
+
+       /* Here it's sure that the MSR can be safely accessed.
+        * Restore the old value and return.
+        */
+       wrmsrl(msr, val_old);
+
+       return true;
+}
+
+static __init void intel_sandybridge_quirk(void)
+{
+       x86_pmu.check_microcode = intel_snb_check_microcode;
+       intel_snb_check_microcode();
+}
+
+static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
+       { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
+       { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
+       { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
+       { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
+       { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
+       { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
+       { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
+};
+
+static __init void intel_arch_events_quirk(void)
+{
+       int bit;
+
+       /* disable event that reported as not presend by cpuid */
+       for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
+               intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
+               pr_warn("CPUID marked event: \'%s\' unavailable\n",
+                       intel_arch_events_map[bit].name);
+       }
+}
+
+static __init void intel_nehalem_quirk(void)
+{
+       union cpuid10_ebx ebx;
+
+       ebx.full = x86_pmu.events_maskl;
+       if (ebx.split.no_branch_misses_retired) {
+               /*
+                * Erratum AAJ80 detected, we work it around by using
+                * the BR_MISP_EXEC.ANY event. This will over-count
+                * branch-misses, but it's still much better than the
+                * architectural event which is often completely bogus:
+                */
+               intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+               ebx.split.no_branch_misses_retired = 0;
+               x86_pmu.events_maskl = ebx.full;
+               pr_info("CPU erratum AAJ80 worked around\n");
+       }
+}
+
+/*
+ * enable software workaround for errata:
+ * SNB: BJ122
+ * IVB: BV98
+ * HSW: HSD29
+ *
+ * Only needed when HT is enabled. However detecting
+ * if HT is enabled is difficult (model specific). So instead,
+ * we enable the workaround in the early boot, and verify if
+ * it is needed in a later initcall phase once we have valid
+ * topology information to check if HT is actually enabled
+ */
+static __init void intel_ht_bug(void)
+{
+       x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
+
+       x86_pmu.start_scheduling = intel_start_scheduling;
+       x86_pmu.commit_scheduling = intel_commit_scheduling;
+       x86_pmu.stop_scheduling = intel_stop_scheduling;
+}
+
+EVENT_ATTR_STR(mem-loads,      mem_ld_hsw,     "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores,     mem_st_hsw,     "event=0xd0,umask=0x82")
+
+/* Haswell special events */
+EVENT_ATTR_STR(tx-start,       tx_start,       "event=0xc9,umask=0x1");
+EVENT_ATTR_STR(tx-commit,      tx_commit,      "event=0xc9,umask=0x2");
+EVENT_ATTR_STR(tx-abort,       tx_abort,       "event=0xc9,umask=0x4");
+EVENT_ATTR_STR(tx-capacity,    tx_capacity,    "event=0x54,umask=0x2");
+EVENT_ATTR_STR(tx-conflict,    tx_conflict,    "event=0x54,umask=0x1");
+EVENT_ATTR_STR(el-start,       el_start,       "event=0xc8,umask=0x1");
+EVENT_ATTR_STR(el-commit,      el_commit,      "event=0xc8,umask=0x2");
+EVENT_ATTR_STR(el-abort,       el_abort,       "event=0xc8,umask=0x4");
+EVENT_ATTR_STR(el-capacity,    el_capacity,    "event=0x54,umask=0x2");
+EVENT_ATTR_STR(el-conflict,    el_conflict,    "event=0x54,umask=0x1");
+EVENT_ATTR_STR(cycles-t,       cycles_t,       "event=0x3c,in_tx=1");
+EVENT_ATTR_STR(cycles-ct,      cycles_ct,      "event=0x3c,in_tx=1,in_tx_cp=1");
+
+static struct attribute *hsw_events_attrs[] = {
+       EVENT_PTR(tx_start),
+       EVENT_PTR(tx_commit),
+       EVENT_PTR(tx_abort),
+       EVENT_PTR(tx_capacity),
+       EVENT_PTR(tx_conflict),
+       EVENT_PTR(el_start),
+       EVENT_PTR(el_commit),
+       EVENT_PTR(el_abort),
+       EVENT_PTR(el_capacity),
+       EVENT_PTR(el_conflict),
+       EVENT_PTR(cycles_t),
+       EVENT_PTR(cycles_ct),
+       EVENT_PTR(mem_ld_hsw),
+       EVENT_PTR(mem_st_hsw),
+       NULL
+};
+
+__init int intel_pmu_init(void)
+{
+       union cpuid10_edx edx;
+       union cpuid10_eax eax;
+       union cpuid10_ebx ebx;
+       struct event_constraint *c;
+       unsigned int unused;
+       struct extra_reg *er;
+       int version, i;
+
+       if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+               switch (boot_cpu_data.x86) {
+               case 0x6:
+                       return p6_pmu_init();
+               case 0xb:
+                       return knc_pmu_init();
+               case 0xf:
+                       return p4_pmu_init();
+               }
+               return -ENODEV;
+       }
+
+       /*
+        * Check whether the Architectural PerfMon supports
+        * Branch Misses Retired hw_event or not.
+        */
+       cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+       if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
+               return -ENODEV;
+
+       version = eax.split.version_id;
+       if (version < 2)
+               x86_pmu = core_pmu;
+       else
+               x86_pmu = intel_pmu;
+
+       x86_pmu.version                 = version;
+       x86_pmu.num_counters            = eax.split.num_counters;
+       x86_pmu.cntval_bits             = eax.split.bit_width;
+       x86_pmu.cntval_mask             = (1ULL << eax.split.bit_width) - 1;
+
+       x86_pmu.events_maskl            = ebx.full;
+       x86_pmu.events_mask_len         = eax.split.mask_length;
+
+       x86_pmu.max_pebs_events         = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
+
+       /*
+        * Quirk: v2 perfmon does not report fixed-purpose events, so
+        * assume at least 3 events:
+        */
+       if (version > 1)
+               x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
+
+       if (boot_cpu_has(X86_FEATURE_PDCM)) {
+               u64 capabilities;
+
+               rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+               x86_pmu.intel_cap.capabilities = capabilities;
+       }
+
+       intel_ds_init();
+
+       x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
+
+       /*
+        * Install the hw-cache-events table:
+        */
+       switch (boot_cpu_data.x86_model) {
+       case 14: /* 65nm Core "Yonah" */
+               pr_cont("Core events, ");
+               break;
+
+       case 15: /* 65nm Core2 "Merom"          */
+               x86_add_quirk(intel_clovertown_quirk);
+       case 22: /* 65nm Core2 "Merom-L"        */
+       case 23: /* 45nm Core2 "Penryn"         */
+       case 29: /* 45nm Core2 "Dunnington (MP) */
+               memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+
+               intel_pmu_lbr_init_core();
+
+               x86_pmu.event_constraints = intel_core2_event_constraints;
+               x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
+               pr_cont("Core2 events, ");
+               break;
+
+       case 30: /* 45nm Nehalem    */
+       case 26: /* 45nm Nehalem-EP */
+       case 46: /* 45nm Nehalem-EX */
+               memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
+                      sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_nhm();
+
+               x86_pmu.event_constraints = intel_nehalem_event_constraints;
+               x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
+               x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+               x86_pmu.extra_regs = intel_nehalem_extra_regs;
+
+               x86_pmu.cpu_events = nhm_events_attrs;
+
+               /* UOPS_ISSUED.STALLED_CYCLES */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+               /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                       X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
+
+               intel_pmu_pebs_data_source_nhm();
+               x86_add_quirk(intel_nehalem_quirk);
+
+               pr_cont("Nehalem events, ");
+               break;
+
+       case 28: /* 45nm Atom "Pineview"   */
+       case 38: /* 45nm Atom "Lincroft"   */
+       case 39: /* 32nm Atom "Penwell"    */
+       case 53: /* 32nm Atom "Cloverview" */
+       case 54: /* 32nm Atom "Cedarview"  */
+               memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+
+               intel_pmu_lbr_init_atom();
+
+               x86_pmu.event_constraints = intel_gen_event_constraints;
+               x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
+               pr_cont("Atom events, ");
+               break;
+
+       case 55: /* 22nm Atom "Silvermont"                */
+       case 76: /* 14nm Atom "Airmont"                   */
+       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+               memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
+                       sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
+                      sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_atom();
+
+               x86_pmu.event_constraints = intel_slm_event_constraints;
+               x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_slm_extra_regs;
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               pr_cont("Silvermont events, ");
+               break;
+
+       case 37: /* 32nm Westmere    */
+       case 44: /* 32nm Westmere-EP */
+       case 47: /* 32nm Westmere-EX */
+               memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
+                      sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_nhm();
+
+               x86_pmu.event_constraints = intel_westmere_event_constraints;
+               x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+               x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_westmere_extra_regs;
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+
+               x86_pmu.cpu_events = nhm_events_attrs;
+
+               /* UOPS_ISSUED.STALLED_CYCLES */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+               /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                       X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
+
+               intel_pmu_pebs_data_source_nhm();
+               pr_cont("Westmere events, ");
+               break;
+
+       case 42: /* 32nm SandyBridge         */
+       case 45: /* 32nm SandyBridge-E/EN/EP */
+               x86_add_quirk(intel_sandybridge_quirk);
+               x86_add_quirk(intel_ht_bug);
+               memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
+                      sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_snb();
+
+               x86_pmu.event_constraints = intel_snb_event_constraints;
+               x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+               if (boot_cpu_data.x86_model == 45)
+                       x86_pmu.extra_regs = intel_snbep_extra_regs;
+               else
+                       x86_pmu.extra_regs = intel_snb_extra_regs;
+
+
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.cpu_events = snb_events_attrs;
+
+               /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+               /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                       X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
+
+               pr_cont("SandyBridge events, ");
+               break;
+
+       case 58: /* 22nm IvyBridge       */
+       case 62: /* 22nm IvyBridge-EP/EX */
+               x86_add_quirk(intel_ht_bug);
+               memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+               /* dTLB-load-misses on IVB is different than SNB */
+               hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
+
+               memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
+                      sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_snb();
+
+               x86_pmu.event_constraints = intel_ivb_event_constraints;
+               x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+               x86_pmu.pebs_prec_dist = true;
+               if (boot_cpu_data.x86_model == 62)
+                       x86_pmu.extra_regs = intel_snbep_extra_regs;
+               else
+                       x86_pmu.extra_regs = intel_snb_extra_regs;
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.cpu_events = snb_events_attrs;
+
+               /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+
+               pr_cont("IvyBridge events, ");
+               break;
+
+
+       case 60: /* 22nm Haswell Core */
+       case 63: /* 22nm Haswell Server */
+       case 69: /* 22nm Haswell ULT */
+       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+               x86_add_quirk(intel_ht_bug);
+               x86_pmu.late_ack = true;
+               memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_hsw();
+
+               x86_pmu.event_constraints = intel_hsw_event_constraints;
+               x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_snbep_extra_regs;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+               x86_pmu.pebs_prec_dist = true;
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.hw_config = hsw_hw_config;
+               x86_pmu.get_event_constraints = hsw_get_event_constraints;
+               x86_pmu.cpu_events = hsw_events_attrs;
+               x86_pmu.lbr_double_abort = true;
+               pr_cont("Haswell events, ");
+               break;
+
+       case 61: /* 14nm Broadwell Core-M */
+       case 86: /* 14nm Broadwell Xeon D */
+       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+       case 79: /* 14nm Broadwell Server */
+               x86_pmu.late_ack = true;
+               memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+               /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+               hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+                                                                        BDW_L3_MISS|HSW_SNOOP_DRAM;
+               hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+                                                                         HSW_SNOOP_DRAM;
+               hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+                                                                            BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+               hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+                                                                             BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+
+               intel_pmu_lbr_init_hsw();
+
+               x86_pmu.event_constraints = intel_bdw_event_constraints;
+               x86_pmu.pebs_constraints = intel_bdw_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_snbep_extra_regs;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+               x86_pmu.pebs_prec_dist = true;
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.hw_config = hsw_hw_config;
+               x86_pmu.get_event_constraints = hsw_get_event_constraints;
+               x86_pmu.cpu_events = hsw_events_attrs;
+               x86_pmu.limit_period = bdw_limit_period;
+               pr_cont("Broadwell events, ");
+               break;
+
+       case 87: /* Knights Landing Xeon Phi */
+               memcpy(hw_cache_event_ids,
+                      slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs,
+                      knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+               intel_pmu_lbr_init_knl();
+
+               x86_pmu.event_constraints = intel_slm_event_constraints;
+               x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_knl_extra_regs;
+
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               pr_cont("Knights Landing events, ");
+               break;
+
+       case 78: /* 14nm Skylake Mobile */
+       case 94: /* 14nm Skylake Desktop */
+               x86_pmu.late_ack = true;
+               memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+               intel_pmu_lbr_init_skl();
+
+               x86_pmu.event_constraints = intel_skl_event_constraints;
+               x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_skl_extra_regs;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
+               x86_pmu.pebs_prec_dist = true;
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.hw_config = hsw_hw_config;
+               x86_pmu.get_event_constraints = hsw_get_event_constraints;
+               x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
+                                                 skl_format_attr);
+               WARN_ON(!x86_pmu.format_attrs);
+               x86_pmu.cpu_events = hsw_events_attrs;
+               pr_cont("Skylake events, ");
+               break;
+
+       default:
+               switch (x86_pmu.version) {
+               case 1:
+                       x86_pmu.event_constraints = intel_v1_event_constraints;
+                       pr_cont("generic architected perfmon v1, ");
+                       break;
+               default:
+                       /*
+                        * default constraints for v2 and up
+                        */
+                       x86_pmu.event_constraints = intel_gen_event_constraints;
+                       pr_cont("generic architected perfmon, ");
+                       break;
+               }
+       }
+
+       if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
+               WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+                    x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
+               x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
+       }
+       x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+
+       if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
+               WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+                    x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
+               x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
+       }
+
+       x86_pmu.intel_ctrl |=
+               ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+
+       if (x86_pmu.event_constraints) {
+               /*
+                * event on fixed counter2 (REF_CYCLES) only works on this
+                * counter, so do not extend mask to generic counters
+                */
+               for_each_event_constraint(c, x86_pmu.event_constraints) {
+                       if (c->cmask == FIXED_EVENT_FLAGS
+                           && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
+                               c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+                       }
+                       c->idxmsk64 &=
+                               ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
+                       c->weight = hweight64(c->idxmsk64);
+               }
+       }
+
+       /*
+        * Access LBR MSR may cause #GP under certain circumstances.
+        * E.g. KVM doesn't support LBR MSR
+        * Check all LBT MSR here.
+        * Disable LBR access if any LBR MSRs can not be accessed.
+        */
+       if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+               x86_pmu.lbr_nr = 0;
+       for (i = 0; i < x86_pmu.lbr_nr; i++) {
+               if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
+                     check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
+                       x86_pmu.lbr_nr = 0;
+       }
+
+       /*
+        * Access extra MSR may cause #GP under certain circumstances.
+        * E.g. KVM doesn't support offcore event
+        * Check all extra_regs here.
+        */
+       if (x86_pmu.extra_regs) {
+               for (er = x86_pmu.extra_regs; er->msr; er++) {
+                       er->extra_msr_access = check_msr(er->msr, 0x11UL);
+                       /* Disable LBR select mapping */
+                       if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+                               x86_pmu.lbr_sel_map = NULL;
+               }
+       }
+
+       /* Support full width counters using alternative MSR range */
+       if (x86_pmu.intel_cap.full_width_write) {
+               x86_pmu.max_period = x86_pmu.cntval_mask;
+               x86_pmu.perfctr = MSR_IA32_PMC0;
+               pr_cont("full-width counters, ");
+       }
+
+       return 0;
+}
+
+/*
+ * HT bug: phase 2 init
+ * Called once we have valid topology information to check
+ * whether or not HT is enabled
+ * If HT is off, then we disable the workaround
+ */
+static __init int fixup_ht_bug(void)
+{
+       int cpu = smp_processor_id();
+       int w, c;
+       /*
+        * problem not present on this CPU model, nothing to do
+        */
+       if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
+               return 0;
+
+       w = cpumask_weight(topology_sibling_cpumask(cpu));
+       if (w > 1) {
+               pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
+               return 0;
+       }
+
+       if (lockup_detector_suspend() != 0) {
+               pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
+               return 0;
+       }
+
+       x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
+
+       x86_pmu.start_scheduling = NULL;
+       x86_pmu.commit_scheduling = NULL;
+       x86_pmu.stop_scheduling = NULL;
+
+       lockup_detector_resume();
+
+       get_online_cpus();
+
+       for_each_online_cpu(c) {
+               free_excl_cntrs(c);
+       }
+
+       put_online_cpus();
+       pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
+       return 0;
+}
+subsys_initcall(fixup_ht_bug)
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c

new file mode 100644 (file)

index 0000000..93cb412
--- /dev/null
+++ b/arch/x86/events/intel/cqm.c
@@ -0,0 +1,1381 @@
+/*
+ * Intel Cache Quality-of-Service Monitoring (CQM) support.
+ *
+ * Based very, very heavily on work by Peter Zijlstra.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "../perf_event.h"
+
+#define MSR_IA32_PQR_ASSOC     0x0c8f
+#define MSR_IA32_QM_CTR                0x0c8e
+#define MSR_IA32_QM_EVTSEL     0x0c8d
+
+static u32 cqm_max_rmid = -1;
+static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+
+/**
+ * struct intel_pqr_state - State cache for the PQR MSR
+ * @rmid:              The cached Resource Monitoring ID
+ * @closid:            The cached Class Of Service ID
+ * @rmid_usecnt:       The usage counter for rmid
+ *
+ * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
+ * contains both parts, so we need to cache them.
+ *
+ * The cache also helps to avoid pointless updates if the value does
+ * not change.
+ */
+struct intel_pqr_state {
+       u32                     rmid;
+       u32                     closid;
+       int                     rmid_usecnt;
+};
+
+/*
+ * The cached intel_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Both functions which modify the state
+ * (intel_cqm_event_start and intel_cqm_event_stop) are called with
+ * interrupts disabled, which is sufficient for the protection.
+ */
+static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+
+/*
+ * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
+ * Also protects event->hw.cqm_rmid
+ *
+ * Hold either for stability, both for modification of ->hw.cqm_rmid.
+ */
+static DEFINE_MUTEX(cache_mutex);
+static DEFINE_RAW_SPINLOCK(cache_lock);
+
+/*
+ * Groups of events that have the same target(s), one RMID per group.
+ */
+static LIST_HEAD(cache_groups);
+
+/*
+ * Mask of CPUs for reading CQM values. We only need one per-socket.
+ */
+static cpumask_t cqm_cpumask;
+
+#define RMID_VAL_ERROR         (1ULL << 63)
+#define RMID_VAL_UNAVAIL       (1ULL << 62)
+
+#define QOS_L3_OCCUP_EVENT_ID  (1 << 0)
+
+#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
+
+/*
+ * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
+ *
+ * This rmid is always free and is guaranteed to have an associated
+ * near-zero occupancy value, i.e. no cachelines are tagged with this
+ * RMID, once __intel_cqm_rmid_rotate() returns.
+ */
+static u32 intel_cqm_rotation_rmid;
+
+#define INVALID_RMID           (-1)
+
+/*
+ * Is @rmid valid for programming the hardware?
+ *
+ * rmid 0 is reserved by the hardware for all non-monitored tasks, which
+ * means that we should never come across an rmid with that value.
+ * Likewise, an rmid value of -1 is used to indicate "no rmid currently
+ * assigned" and is used as part of the rotation code.
+ */
+static inline bool __rmid_valid(u32 rmid)
+{
+       if (!rmid || rmid == INVALID_RMID)
+               return false;
+
+       return true;
+}
+
+static u64 __rmid_read(u32 rmid)
+{
+       u64 val;
+
+       /*
+        * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
+        * it just says that to increase confusion.
+        */
+       wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
+       rdmsrl(MSR_IA32_QM_CTR, val);
+
+       /*
+        * Aside from the ERROR and UNAVAIL bits, assume this thing returns
+        * the number of cachelines tagged with @rmid.
+        */
+       return val;
+}
+
+enum rmid_recycle_state {
+       RMID_YOUNG = 0,
+       RMID_AVAILABLE,
+       RMID_DIRTY,
+};
+
+struct cqm_rmid_entry {
+       u32 rmid;
+       enum rmid_recycle_state state;
+       struct list_head list;
+       unsigned long queue_time;
+};
+
+/*
+ * cqm_rmid_free_lru - A least recently used list of RMIDs.
+ *
+ * Oldest entry at the head, newest (most recently used) entry at the
+ * tail. This list is never traversed, it's only used to keep track of
+ * the lru order. That is, we only pick entries of the head or insert
+ * them on the tail.
+ *
+ * All entries on the list are 'free', and their RMIDs are not currently
+ * in use. To mark an RMID as in use, remove its entry from the lru
+ * list.
+ *
+ *
+ * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
+ *
+ * This list is contains RMIDs that no one is currently using but that
+ * may have a non-zero occupancy value associated with them. The
+ * rotation worker moves RMIDs from the limbo list to the free list once
+ * the occupancy value drops below __intel_cqm_threshold.
+ *
+ * Both lists are protected by cache_mutex.
+ */
+static LIST_HEAD(cqm_rmid_free_lru);
+static LIST_HEAD(cqm_rmid_limbo_lru);
+
+/*
+ * We use a simple array of pointers so that we can lookup a struct
+ * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
+ * and __put_rmid() from having to worry about dealing with struct
+ * cqm_rmid_entry - they just deal with rmids, i.e. integers.
+ *
+ * Once this array is initialized it is read-only. No locks are required
+ * to access it.
+ *
+ * All entries for all RMIDs can be looked up in the this array at all
+ * times.
+ */
+static struct cqm_rmid_entry **cqm_rmid_ptrs;
+
+static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
+{
+       struct cqm_rmid_entry *entry;
+
+       entry = cqm_rmid_ptrs[rmid];
+       WARN_ON(entry->rmid != rmid);
+
+       return entry;
+}
+
+/*
+ * Returns < 0 on fail.
+ *
+ * We expect to be called with cache_mutex held.
+ */
+static u32 __get_rmid(void)
+{
+       struct cqm_rmid_entry *entry;
+
+       lockdep_assert_held(&cache_mutex);
+
+       if (list_empty(&cqm_rmid_free_lru))
+               return INVALID_RMID;
+
+       entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
+       list_del(&entry->list);
+
+       return entry->rmid;
+}
+
+static void __put_rmid(u32 rmid)
+{
+       struct cqm_rmid_entry *entry;
+
+       lockdep_assert_held(&cache_mutex);
+
+       WARN_ON(!__rmid_valid(rmid));
+       entry = __rmid_entry(rmid);
+
+       entry->queue_time = jiffies;
+       entry->state = RMID_YOUNG;
+
+       list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
+}
+
+static int intel_cqm_setup_rmid_cache(void)
+{
+       struct cqm_rmid_entry *entry;
+       unsigned int nr_rmids;
+       int r = 0;
+
+       nr_rmids = cqm_max_rmid + 1;
+       cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
+                               nr_rmids, GFP_KERNEL);
+       if (!cqm_rmid_ptrs)
+               return -ENOMEM;
+
+       for (; r <= cqm_max_rmid; r++) {
+               struct cqm_rmid_entry *entry;
+
+               entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+               if (!entry)
+                       goto fail;
+
+               INIT_LIST_HEAD(&entry->list);
+               entry->rmid = r;
+               cqm_rmid_ptrs[r] = entry;
+
+               list_add_tail(&entry->list, &cqm_rmid_free_lru);
+       }
+
+       /*
+        * RMID 0 is special and is always allocated. It's used for all
+        * tasks that are not monitored.
+        */
+       entry = __rmid_entry(0);
+       list_del(&entry->list);
+
+       mutex_lock(&cache_mutex);
+       intel_cqm_rotation_rmid = __get_rmid();
+       mutex_unlock(&cache_mutex);
+
+       return 0;
+fail:
+       while (r--)
+               kfree(cqm_rmid_ptrs[r]);
+
+       kfree(cqm_rmid_ptrs);
+       return -ENOMEM;
+}
+
+/*
+ * Determine if @a and @b measure the same set of tasks.
+ *
+ * If @a and @b measure the same set of tasks then we want to share a
+ * single RMID.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+       /* Per-cpu and task events don't mix */
+       if ((a->attach_state & PERF_ATTACH_TASK) !=
+           (b->attach_state & PERF_ATTACH_TASK))
+               return false;
+
+#ifdef CONFIG_CGROUP_PERF
+       if (a->cgrp != b->cgrp)
+               return false;
+#endif
+
+       /* If not task event, we're machine wide */
+       if (!(b->attach_state & PERF_ATTACH_TASK))
+               return true;
+
+       /*
+        * Events that target same task are placed into the same cache group.
+        */
+       if (a->hw.target == b->hw.target)
+               return true;
+
+       /*
+        * Are we an inherited event?
+        */
+       if (b->parent == a)
+               return true;
+
+       return false;
+}
+
+#ifdef CONFIG_CGROUP_PERF
+static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+       if (event->attach_state & PERF_ATTACH_TASK)
+               return perf_cgroup_from_task(event->hw.target, event->ctx);
+
+       return event->cgrp;
+}
+#endif
+
+/*
+ * Determine if @a's tasks intersect with @b's tasks
+ *
+ * There are combinations of events that we explicitly prohibit,
+ *
+ *                PROHIBITS
+ *     system-wide    ->       cgroup and task
+ *     cgroup        ->        system-wide
+ *                           ->        task in cgroup
+ *     task          ->        system-wide
+ *                           ->        task in cgroup
+ *
+ * Call this function before allocating an RMID.
+ */
+static bool __conflict_event(struct perf_event *a, struct perf_event *b)
+{
+#ifdef CONFIG_CGROUP_PERF
+       /*
+        * We can have any number of cgroups but only one system-wide
+        * event at a time.
+        */
+       if (a->cgrp && b->cgrp) {
+               struct perf_cgroup *ac = a->cgrp;
+               struct perf_cgroup *bc = b->cgrp;
+
+               /*
+                * This condition should have been caught in
+                * __match_event() and we should be sharing an RMID.
+                */
+               WARN_ON_ONCE(ac == bc);
+
+               if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+                   cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+                       return true;
+
+               return false;
+       }
+
+       if (a->cgrp || b->cgrp) {
+               struct perf_cgroup *ac, *bc;
+
+               /*
+                * cgroup and system-wide events are mutually exclusive
+                */
+               if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
+                   (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
+                       return true;
+
+               /*
+                * Ensure neither event is part of the other's cgroup
+                */
+               ac = event_to_cgroup(a);
+               bc = event_to_cgroup(b);
+               if (ac == bc)
+                       return true;
+
+               /*
+                * Must have cgroup and non-intersecting task events.
+                */
+               if (!ac || !bc)
+                       return false;
+
+               /*
+                * We have cgroup and task events, and the task belongs
+                * to a cgroup. Check for for overlap.
+                */
+               if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+                   cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+                       return true;
+
+               return false;
+       }
+#endif
+       /*
+        * If one of them is not a task, same story as above with cgroups.
+        */
+       if (!(a->attach_state & PERF_ATTACH_TASK) ||
+           !(b->attach_state & PERF_ATTACH_TASK))
+               return true;
+
+       /*
+        * Must be non-overlapping.
+        */
+       return false;
+}
+
+struct rmid_read {
+       u32 rmid;
+       atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info);
+
+/*
+ * Exchange the RMID of a group of events.
+ */
+static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
+{
+       struct perf_event *event;
+       struct list_head *head = &group->hw.cqm_group_entry;
+       u32 old_rmid = group->hw.cqm_rmid;
+
+       lockdep_assert_held(&cache_mutex);
+
+       /*
+        * If our RMID is being deallocated, perform a read now.
+        */
+       if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
+               struct rmid_read rr = {
+                       .value = ATOMIC64_INIT(0),
+                       .rmid = old_rmid,
+               };
+
+               on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
+                                &rr, 1);
+               local64_set(&group->count, atomic64_read(&rr.value));
+       }
+
+       raw_spin_lock_irq(&cache_lock);
+
+       group->hw.cqm_rmid = rmid;
+       list_for_each_entry(event, head, hw.cqm_group_entry)
+               event->hw.cqm_rmid = rmid;
+
+       raw_spin_unlock_irq(&cache_lock);
+
+       return old_rmid;
+}
+
+/*
+ * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
+ * cachelines are still tagged with RMIDs in limbo, we progressively
+ * increment the threshold until we find an RMID in limbo with <=
+ * __intel_cqm_threshold lines tagged. This is designed to mitigate the
+ * problem where cachelines tagged with an RMID are not steadily being
+ * evicted.
+ *
+ * On successful rotations we decrease the threshold back towards zero.
+ *
+ * __intel_cqm_max_threshold provides an upper bound on the threshold,
+ * and is measured in bytes because it's exposed to userland.
+ */
+static unsigned int __intel_cqm_threshold;
+static unsigned int __intel_cqm_max_threshold;
+
+/*
+ * Test whether an RMID has a zero occupancy value on this cpu.
+ */
+static void intel_cqm_stable(void *arg)
+{
+       struct cqm_rmid_entry *entry;
+
+       list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+               if (entry->state != RMID_AVAILABLE)
+                       break;
+
+               if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
+                       entry->state = RMID_DIRTY;
+       }
+}
+
+/*
+ * If we have group events waiting for an RMID that don't conflict with
+ * events already running, assign @rmid.
+ */
+static bool intel_cqm_sched_in_event(u32 rmid)
+{
+       struct perf_event *leader, *event;
+
+       lockdep_assert_held(&cache_mutex);
+
+       leader = list_first_entry(&cache_groups, struct perf_event,
+                                 hw.cqm_groups_entry);
+       event = leader;
+
+       list_for_each_entry_continue(event, &cache_groups,
+                                    hw.cqm_groups_entry) {
+               if (__rmid_valid(event->hw.cqm_rmid))
+                       continue;
+
+               if (__conflict_event(event, leader))
+                       continue;
+
+               intel_cqm_xchg_rmid(event, rmid);
+               return true;
+       }
+
+       return false;
+}
+
+/*
+ * Initially use this constant for both the limbo queue time and the
+ * rotation timer interval, pmu::hrtimer_interval_ms.
+ *
+ * They don't need to be the same, but the two are related since if you
+ * rotate faster than you recycle RMIDs, you may run out of available
+ * RMIDs.
+ */
+#define RMID_DEFAULT_QUEUE_TIME 250    /* ms */
+
+static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
+
+/*
+ * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
+ * @nr_available: number of freeable RMIDs on the limbo list
+ *
+ * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
+ * cachelines are tagged with those RMIDs. After this we can reuse them
+ * and know that the current set of active RMIDs is stable.
+ *
+ * Return %true or %false depending on whether stabilization needs to be
+ * reattempted.
+ *
+ * If we return %true then @nr_available is updated to indicate the
+ * number of RMIDs on the limbo list that have been queued for the
+ * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
+ * are above __intel_cqm_threshold.
+ */
+static bool intel_cqm_rmid_stabilize(unsigned int *available)
+{
+       struct cqm_rmid_entry *entry, *tmp;
+
+       lockdep_assert_held(&cache_mutex);
+
+       *available = 0;
+       list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+               unsigned long min_queue_time;
+               unsigned long now = jiffies;
+
+               /*
+                * We hold RMIDs placed into limbo for a minimum queue
+                * time. Before the minimum queue time has elapsed we do
+                * not recycle RMIDs.
+                *
+                * The reasoning is that until a sufficient time has
+                * passed since we stopped using an RMID, any RMID
+                * placed onto the limbo list will likely still have
+                * data tagged in the cache, which means we'll probably
+                * fail to recycle it anyway.
+                *
+                * We can save ourselves an expensive IPI by skipping
+                * any RMIDs that have not been queued for the minimum
+                * time.
+                */
+               min_queue_time = entry->queue_time +
+                       msecs_to_jiffies(__rmid_queue_time_ms);
+
+               if (time_after(min_queue_time, now))
+                       break;
+
+               entry->state = RMID_AVAILABLE;
+               (*available)++;
+       }
+
+       /*
+        * Fast return if none of the RMIDs on the limbo list have been
+        * sitting on the queue for the minimum queue time.
+        */
+       if (!*available)
+               return false;
+
+       /*
+        * Test whether an RMID is free for each package.
+        */
+       on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
+
+       list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
+               /*
+                * Exhausted all RMIDs that have waited min queue time.
+                */
+               if (entry->state == RMID_YOUNG)
+                       break;
+
+               if (entry->state == RMID_DIRTY)
+                       continue;
+
+               list_del(&entry->list); /* remove from limbo */
+
+               /*
+                * The rotation RMID gets priority if it's
+                * currently invalid. In which case, skip adding
+                * the RMID to the the free lru.
+                */
+               if (!__rmid_valid(intel_cqm_rotation_rmid)) {
+                       intel_cqm_rotation_rmid = entry->rmid;
+                       continue;
+               }
+
+               /*
+                * If we have groups waiting for RMIDs, hand
+                * them one now provided they don't conflict.
+                */
+               if (intel_cqm_sched_in_event(entry->rmid))
+                       continue;
+
+               /*
+                * Otherwise place it onto the free list.
+                */
+               list_add_tail(&entry->list, &cqm_rmid_free_lru);
+       }
+
+
+       return __rmid_valid(intel_cqm_rotation_rmid);
+}
+
+/*
+ * Pick a victim group and move it to the tail of the group list.
+ * @next: The first group without an RMID
+ */
+static void __intel_cqm_pick_and_rotate(struct perf_event *next)
+{
+       struct perf_event *rotor;
+       u32 rmid;
+
+       lockdep_assert_held(&cache_mutex);
+
+       rotor = list_first_entry(&cache_groups, struct perf_event,
+                                hw.cqm_groups_entry);
+
+       /*
+        * The group at the front of the list should always have a valid
+        * RMID. If it doesn't then no groups have RMIDs assigned and we
+        * don't need to rotate the list.
+        */
+       if (next == rotor)
+               return;
+
+       rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
+       __put_rmid(rmid);
+
+       list_rotate_left(&cache_groups);
+}
+
+/*
+ * Deallocate the RMIDs from any events that conflict with @event, and
+ * place them on the back of the group list.
+ */
+static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
+{
+       struct perf_event *group, *g;
+       u32 rmid;
+
+       lockdep_assert_held(&cache_mutex);
+
+       list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
+               if (group == event)
+                       continue;
+
+               rmid = group->hw.cqm_rmid;
+
+               /*
+                * Skip events that don't have a valid RMID.
+                */
+               if (!__rmid_valid(rmid))
+                       continue;
+
+               /*
+                * No conflict? No problem! Leave the event alone.
+                */
+               if (!__conflict_event(group, event))
+                       continue;
+
+               intel_cqm_xchg_rmid(group, INVALID_RMID);
+               __put_rmid(rmid);
+       }
+}
+
+/*
+ * Attempt to rotate the groups and assign new RMIDs.
+ *
+ * We rotate for two reasons,
+ *   1. To handle the scheduling of conflicting events
+ *   2. To recycle RMIDs
+ *
+ * Rotating RMIDs is complicated because the hardware doesn't give us
+ * any clues.
+ *
+ * There's problems with the hardware interface; when you change the
+ * task:RMID map cachelines retain their 'old' tags, giving a skewed
+ * picture. In order to work around this, we must always keep one free
+ * RMID - intel_cqm_rotation_rmid.
+ *
+ * Rotation works by taking away an RMID from a group (the old RMID),
+ * and assigning the free RMID to another group (the new RMID). We must
+ * then wait for the old RMID to not be used (no cachelines tagged).
+ * This ensure that all cachelines are tagged with 'active' RMIDs. At
+ * this point we can start reading values for the new RMID and treat the
+ * old RMID as the free RMID for the next rotation.
+ *
+ * Return %true or %false depending on whether we did any rotating.
+ */
+static bool __intel_cqm_rmid_rotate(void)
+{
+       struct perf_event *group, *start = NULL;
+       unsigned int threshold_limit;
+       unsigned int nr_needed = 0;
+       unsigned int nr_available;
+       bool rotated = false;
+
+       mutex_lock(&cache_mutex);
+
+again:
+       /*
+        * Fast path through this function if there are no groups and no
+        * RMIDs that need cleaning.
+        */
+       if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
+               goto out;
+
+       list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
+               if (!__rmid_valid(group->hw.cqm_rmid)) {
+                       if (!start)
+                               start = group;
+                       nr_needed++;
+               }
+       }
+
+       /*
+        * We have some event groups, but they all have RMIDs assigned
+        * and no RMIDs need cleaning.
+        */
+       if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
+               goto out;
+
+       if (!nr_needed)
+               goto stabilize;
+
+       /*
+        * We have more event groups without RMIDs than available RMIDs,
+        * or we have event groups that conflict with the ones currently
+        * scheduled.
+        *
+        * We force deallocate the rmid of the group at the head of
+        * cache_groups. The first event group without an RMID then gets
+        * assigned intel_cqm_rotation_rmid. This ensures we always make
+        * forward progress.
+        *
+        * Rotate the cache_groups list so the previous head is now the
+        * tail.
+        */
+       __intel_cqm_pick_and_rotate(start);
+
+       /*
+        * If the rotation is going to succeed, reduce the threshold so
+        * that we don't needlessly reuse dirty RMIDs.
+        */
+       if (__rmid_valid(intel_cqm_rotation_rmid)) {
+               intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
+               intel_cqm_rotation_rmid = __get_rmid();
+
+               intel_cqm_sched_out_conflicting_events(start);
+
+               if (__intel_cqm_threshold)
+                       __intel_cqm_threshold--;
+       }
+
+       rotated = true;
+
+stabilize:
+       /*
+        * We now need to stablize the RMID we freed above (if any) to
+        * ensure that the next time we rotate we have an RMID with zero
+        * occupancy value.
+        *
+        * Alternatively, if we didn't need to perform any rotation,
+        * we'll have a bunch of RMIDs in limbo that need stabilizing.
+        */
+       threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
+
+       while (intel_cqm_rmid_stabilize(&nr_available) &&
+              __intel_cqm_threshold < threshold_limit) {
+               unsigned int steal_limit;
+
+               /*
+                * Don't spin if nobody is actively waiting for an RMID,
+                * the rotation worker will be kicked as soon as an
+                * event needs an RMID anyway.
+                */
+               if (!nr_needed)
+                       break;
+
+               /* Allow max 25% of RMIDs to be in limbo. */
+               steal_limit = (cqm_max_rmid + 1) / 4;
+
+               /*
+                * We failed to stabilize any RMIDs so our rotation
+                * logic is now stuck. In order to make forward progress
+                * we have a few options:
+                *
+                *   1. rotate ("steal") another RMID
+                *   2. increase the threshold
+                *   3. do nothing
+                *
+                * We do both of 1. and 2. until we hit the steal limit.
+                *
+                * The steal limit prevents all RMIDs ending up on the
+                * limbo list. This can happen if every RMID has a
+                * non-zero occupancy above threshold_limit, and the
+                * occupancy values aren't dropping fast enough.
+                *
+                * Note that there is prioritisation at work here - we'd
+                * rather increase the number of RMIDs on the limbo list
+                * than increase the threshold, because increasing the
+                * threshold skews the event data (because we reuse
+                * dirty RMIDs) - threshold bumps are a last resort.
+                */
+               if (nr_available < steal_limit)
+                       goto again;
+
+               __intel_cqm_threshold++;
+       }
+
+out:
+       mutex_unlock(&cache_mutex);
+       return rotated;
+}
+
+static void intel_cqm_rmid_rotate(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
+
+static struct pmu intel_cqm_pmu;
+
+static void intel_cqm_rmid_rotate(struct work_struct *work)
+{
+       unsigned long delay;
+
+       __intel_cqm_rmid_rotate();
+
+       delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
+       schedule_delayed_work(&intel_cqm_rmid_work, delay);
+}
+
+/*
+ * Find a group and setup RMID.
+ *
+ * If we're part of a group, we use the group's RMID.
+ */
+static void intel_cqm_setup_event(struct perf_event *event,
+                                 struct perf_event **group)
+{
+       struct perf_event *iter;
+       bool conflict = false;
+       u32 rmid;
+
+       list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
+               rmid = iter->hw.cqm_rmid;
+
+               if (__match_event(iter, event)) {
+                       /* All tasks in a group share an RMID */
+                       event->hw.cqm_rmid = rmid;
+                       *group = iter;
+                       return;
+               }
+
+               /*
+                * We only care about conflicts for events that are
+                * actually scheduled in (and hence have a valid RMID).
+                */
+               if (__conflict_event(iter, event) && __rmid_valid(rmid))
+                       conflict = true;
+       }
+
+       if (conflict)
+               rmid = INVALID_RMID;
+       else
+               rmid = __get_rmid();
+
+       event->hw.cqm_rmid = rmid;
+}
+
+static void intel_cqm_event_read(struct perf_event *event)
+{
+       unsigned long flags;
+       u32 rmid;
+       u64 val;
+
+       /*
+        * Task events are handled by intel_cqm_event_count().
+        */
+       if (event->cpu == -1)
+               return;
+
+       raw_spin_lock_irqsave(&cache_lock, flags);
+       rmid = event->hw.cqm_rmid;
+
+       if (!__rmid_valid(rmid))
+               goto out;
+
+       val = __rmid_read(rmid);
+
+       /*
+        * Ignore this reading on error states and do not update the value.
+        */
+       if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+               goto out;
+
+       local64_set(&event->count, val);
+out:
+       raw_spin_unlock_irqrestore(&cache_lock, flags);
+}
+
+static void __intel_cqm_event_count(void *info)
+{
+       struct rmid_read *rr = info;
+       u64 val;
+
+       val = __rmid_read(rr->rmid);
+
+       if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+               return;
+
+       atomic64_add(val, &rr->value);
+}
+
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+       return !list_empty(&event->hw.cqm_groups_entry);
+}
+
+static u64 intel_cqm_event_count(struct perf_event *event)
+{
+       unsigned long flags;
+       struct rmid_read rr = {
+               .value = ATOMIC64_INIT(0),
+       };
+
+       /*
+        * We only need to worry about task events. System-wide events
+        * are handled like usual, i.e. entirely with
+        * intel_cqm_event_read().
+        */
+       if (event->cpu != -1)
+               return __perf_event_count(event);
+
+       /*
+        * Only the group leader gets to report values. This stops us
+        * reporting duplicate values to userspace, and gives us a clear
+        * rule for which task gets to report the values.
+        *
+        * Note that it is impossible to attribute these values to
+        * specific packages - we forfeit that ability when we create
+        * task events.
+        */
+       if (!cqm_group_leader(event))
+               return 0;
+
+       /*
+        * Getting up-to-date values requires an SMP IPI which is not
+        * possible if we're being called in interrupt context. Return
+        * the cached values instead.
+        */
+       if (unlikely(in_interrupt()))
+               goto out;
+
+       /*
+        * Notice that we don't perform the reading of an RMID
+        * atomically, because we can't hold a spin lock across the
+        * IPIs.
+        *
+        * Speculatively perform the read, since @event might be
+        * assigned a different (possibly invalid) RMID while we're
+        * busying performing the IPI calls. It's therefore necessary to
+        * check @event's RMID afterwards, and if it has changed,
+        * discard the result of the read.
+        */
+       rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
+
+       if (!__rmid_valid(rr.rmid))
+               goto out;
+
+       on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+
+       raw_spin_lock_irqsave(&cache_lock, flags);
+       if (event->hw.cqm_rmid == rr.rmid)
+               local64_set(&event->count, atomic64_read(&rr.value));
+       raw_spin_unlock_irqrestore(&cache_lock, flags);
+out:
+       return __perf_event_count(event);
+}
+
+static void intel_cqm_event_start(struct perf_event *event, int mode)
+{
+       struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+       u32 rmid = event->hw.cqm_rmid;
+
+       if (!(event->hw.cqm_state & PERF_HES_STOPPED))
+               return;
+
+       event->hw.cqm_state &= ~PERF_HES_STOPPED;
+
+       if (state->rmid_usecnt++) {
+               if (!WARN_ON_ONCE(state->rmid != rmid))
+                       return;
+       } else {
+               WARN_ON_ONCE(state->rmid);
+       }
+
+       state->rmid = rmid;
+       wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
+}
+
+static void intel_cqm_event_stop(struct perf_event *event, int mode)
+{
+       struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+
+       if (event->hw.cqm_state & PERF_HES_STOPPED)
+               return;
+
+       event->hw.cqm_state |= PERF_HES_STOPPED;
+
+       intel_cqm_event_read(event);
+
+       if (!--state->rmid_usecnt) {
+               state->rmid = 0;
+               wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
+       } else {
+               WARN_ON_ONCE(!state->rmid);
+       }
+}
+
+static int intel_cqm_event_add(struct perf_event *event, int mode)
+{
+       unsigned long flags;
+       u32 rmid;
+
+       raw_spin_lock_irqsave(&cache_lock, flags);
+
+       event->hw.cqm_state = PERF_HES_STOPPED;
+       rmid = event->hw.cqm_rmid;
+
+       if (__rmid_valid(rmid) && (mode & PERF_EF_START))
+               intel_cqm_event_start(event, mode);
+
+       raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+       return 0;
+}
+
+static void intel_cqm_event_destroy(struct perf_event *event)
+{
+       struct perf_event *group_other = NULL;
+
+       mutex_lock(&cache_mutex);
+
+       /*
+        * If there's another event in this group...
+        */
+       if (!list_empty(&event->hw.cqm_group_entry)) {
+               group_other = list_first_entry(&event->hw.cqm_group_entry,
+                                              struct perf_event,
+                                              hw.cqm_group_entry);
+               list_del(&event->hw.cqm_group_entry);
+       }
+
+       /*
+        * And we're the group leader..
+        */
+       if (cqm_group_leader(event)) {
+               /*
+                * If there was a group_other, make that leader, otherwise
+                * destroy the group and return the RMID.
+                */
+               if (group_other) {
+                       list_replace(&event->hw.cqm_groups_entry,
+                                    &group_other->hw.cqm_groups_entry);
+               } else {
+                       u32 rmid = event->hw.cqm_rmid;
+
+                       if (__rmid_valid(rmid))
+                               __put_rmid(rmid);
+                       list_del(&event->hw.cqm_groups_entry);
+               }
+       }
+
+       mutex_unlock(&cache_mutex);
+}
+
+static int intel_cqm_event_init(struct perf_event *event)
+{
+       struct perf_event *group = NULL;
+       bool rotate = false;
+
+       if (event->attr.type != intel_cqm_pmu.type)
+               return -ENOENT;
+
+       if (event->attr.config & ~QOS_EVENT_MASK)
+               return -EINVAL;
+
+       /* unsupported modes and filters */
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest  ||
+           event->attr.sample_period) /* no sampling */
+               return -EINVAL;
+
+       INIT_LIST_HEAD(&event->hw.cqm_group_entry);
+       INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
+
+       event->destroy = intel_cqm_event_destroy;
+
+       mutex_lock(&cache_mutex);
+
+       /* Will also set rmid */
+       intel_cqm_setup_event(event, &group);
+
+       if (group) {
+               list_add_tail(&event->hw.cqm_group_entry,
+                             &group->hw.cqm_group_entry);
+       } else {
+               list_add_tail(&event->hw.cqm_groups_entry,
+                             &cache_groups);
+
+               /*
+                * All RMIDs are either in use or have recently been
+                * used. Kick the rotation worker to clean/free some.
+                *
+                * We only do this for the group leader, rather than for
+                * every event in a group to save on needless work.
+                */
+               if (!__rmid_valid(event->hw.cqm_rmid))
+                       rotate = true;
+       }
+
+       mutex_unlock(&cache_mutex);
+
+       if (rotate)
+               schedule_delayed_work(&intel_cqm_rmid_work, 0);
+
+       return 0;
+}
+
+EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
+EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
+EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
+EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
+EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
+
+static struct attribute *intel_cqm_events_attr[] = {
+       EVENT_PTR(intel_cqm_llc),
+       EVENT_PTR(intel_cqm_llc_pkg),
+       EVENT_PTR(intel_cqm_llc_unit),
+       EVENT_PTR(intel_cqm_llc_scale),
+       EVENT_PTR(intel_cqm_llc_snapshot),
+       NULL,
+};
+
+static struct attribute_group intel_cqm_events_group = {
+       .name = "events",
+       .attrs = intel_cqm_events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *intel_cqm_formats_attr[] = {
+       &format_attr_event.attr,
+       NULL,
+};
+
+static struct attribute_group intel_cqm_format_group = {
+       .name = "format",
+       .attrs = intel_cqm_formats_attr,
+};
+
+static ssize_t
+max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
+                          char *page)
+{
+       ssize_t rv;
+
+       mutex_lock(&cache_mutex);
+       rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
+       mutex_unlock(&cache_mutex);
+
+       return rv;
+}
+
+static ssize_t
+max_recycle_threshold_store(struct device *dev,
+                           struct device_attribute *attr,
+                           const char *buf, size_t count)
+{
+       unsigned int bytes, cachelines;
+       int ret;
+
+       ret = kstrtouint(buf, 0, &bytes);
+       if (ret)
+               return ret;
+
+       mutex_lock(&cache_mutex);
+
+       __intel_cqm_max_threshold = bytes;
+       cachelines = bytes / cqm_l3_scale;
+
+       /*
+        * The new maximum takes effect immediately.
+        */
+       if (__intel_cqm_threshold > cachelines)
+               __intel_cqm_threshold = cachelines;
+
+       mutex_unlock(&cache_mutex);
+
+       return count;
+}
+
+static DEVICE_ATTR_RW(max_recycle_threshold);
+
+static struct attribute *intel_cqm_attrs[] = {
+       &dev_attr_max_recycle_threshold.attr,
+       NULL,
+};
+
+static const struct attribute_group intel_cqm_group = {
+       .attrs = intel_cqm_attrs,
+};
+
+static const struct attribute_group *intel_cqm_attr_groups[] = {
+       &intel_cqm_events_group,
+       &intel_cqm_format_group,
+       &intel_cqm_group,
+       NULL,
+};
+
+static struct pmu intel_cqm_pmu = {
+       .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
+       .attr_groups         = intel_cqm_attr_groups,
+       .task_ctx_nr         = perf_sw_context,
+       .event_init          = intel_cqm_event_init,
+       .add                 = intel_cqm_event_add,
+       .del                 = intel_cqm_event_stop,
+       .start               = intel_cqm_event_start,
+       .stop                = intel_cqm_event_stop,
+       .read                = intel_cqm_event_read,
+       .count               = intel_cqm_event_count,
+};
+
+static inline void cqm_pick_event_reader(int cpu)
+{
+       int reader;
+
+       /* First online cpu in package becomes the reader */
+       reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu));
+       if (reader >= nr_cpu_ids)
+               cpumask_set_cpu(cpu, &cqm_cpumask);
+}
+
+static void intel_cqm_cpu_starting(unsigned int cpu)
+{
+       struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
+       struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+       state->rmid = 0;
+       state->closid = 0;
+       state->rmid_usecnt = 0;
+
+       WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
+       WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
+}
+
+static void intel_cqm_cpu_exit(unsigned int cpu)
+{
+       int target;
+
+       /* Is @cpu the current cqm reader for this package ? */
+       if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
+               return;
+
+       /* Find another online reader in this package */
+       target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+
+       if (target < nr_cpu_ids)
+               cpumask_set_cpu(target, &cqm_cpumask);
+}
+
+static int intel_cqm_cpu_notifier(struct notifier_block *nb,
+                                 unsigned long action, void *hcpu)
+{
+       unsigned int cpu  = (unsigned long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_DOWN_PREPARE:
+               intel_cqm_cpu_exit(cpu);
+               break;
+       case CPU_STARTING:
+               intel_cqm_cpu_starting(cpu);
+               cqm_pick_event_reader(cpu);
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+static const struct x86_cpu_id intel_cqm_match[] = {
+       { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
+       {}
+};
+
+static int __init intel_cqm_init(void)
+{
+       char *str, scale[20];
+       int i, cpu, ret;
+
+       if (!x86_match_cpu(intel_cqm_match))
+               return -ENODEV;
+
+       cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+
+       /*
+        * It's possible that not all resources support the same number
+        * of RMIDs. Instead of making scheduling much more complicated
+        * (where we have to match a task's RMID to a cpu that supports
+        * that many RMIDs) just find the minimum RMIDs supported across
+        * all cpus.
+        *
+        * Also, check that the scales match on all cpus.
+        */
+       cpu_notifier_register_begin();
+
+       for_each_online_cpu(cpu) {
+               struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+               if (c->x86_cache_max_rmid < cqm_max_rmid)
+                       cqm_max_rmid = c->x86_cache_max_rmid;
+
+               if (c->x86_cache_occ_scale != cqm_l3_scale) {
+                       pr_err("Multiple LLC scale values, disabling\n");
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+
+       /*
+        * A reasonable upper limit on the max threshold is the number
+        * of lines tagged per RMID if all RMIDs have the same number of
+        * lines tagged in the LLC.
+        *
+        * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+        */
+       __intel_cqm_max_threshold =
+               boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
+
+       snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
+       str = kstrdup(scale, GFP_KERNEL);
+       if (!str) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       event_attr_intel_cqm_llc_scale.event_str = str;
+
+       ret = intel_cqm_setup_rmid_cache();
+       if (ret)
+               goto out;
+
+       for_each_online_cpu(i) {
+               intel_cqm_cpu_starting(i);
+               cqm_pick_event_reader(i);
+       }
+
+       __perf_cpu_notifier(intel_cqm_cpu_notifier);
+
+       ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
+       if (ret)
+               pr_err("Intel CQM perf registration failed: %d\n", ret);
+       else
+               pr_info("Intel CQM monitoring enabled\n");
+
+out:
+       cpu_notifier_register_done();
+
+       return ret;
+}
+device_initcall(intel_cqm_init);
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c

new file mode 100644 (file)

index 0000000..7946c42
--- /dev/null
+++ b/arch/x86/events/intel/cstate.c
@@ -0,0 +1,694 @@
+/*
+ * perf_event_intel_cstate.c: support cstate residency counters
+ *
+ * Copyright (C) 2015, Intel Corp.
+ * Author: Kan Liang (kan.liang@intel.com)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ */
+
+/*
+ * This file export cstate related free running (read-only) counters
+ * for perf. These counters may be use simultaneously by other tools,
+ * such as turbostat. However, it still make sense to implement them
+ * in perf. Because we can conveniently collect them together with
+ * other events, and allow to use them from tools without special MSR
+ * access code.
+ *
+ * The events only support system-wide mode counting. There is no
+ * sampling support because it is not supported by the hardware.
+ *
+ * According to counters' scope and category, two PMUs are registered
+ * with the perf_event core subsystem.
+ *  - 'cstate_core': The counter is available for each physical core.
+ *    The counters include CORE_C*_RESIDENCY.
+ *  - 'cstate_pkg': The counter is available for each physical package.
+ *    The counters include PKG_C*_RESIDENCY.
+ *
+ * All of these counters are specified in the Intel® 64 and IA-32
+ * Architectures Software Developer.s Manual Vol3b.
+ *
+ * Model specific counters:
+ *     MSR_CORE_C1_RES: CORE C1 Residency Counter
+ *                      perf code: 0x00
+ *                      Available model: SLM,AMT
+ *                      Scope: Core (each processor core has a MSR)
+ *     MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
+ *                            perf code: 0x01
+ *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Core
+ *     MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter
+ *                            perf code: 0x02
+ *                            Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Core
+ *     MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
+ *                            perf code: 0x03
+ *                            Available model: SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Core
+ *     MSR_PKG_C2_RESIDENCY:  Package C2 Residency Counter.
+ *                            perf code: 0x00
+ *                            Available model: SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C3_RESIDENCY:  Package C3 Residency Counter.
+ *                            perf code: 0x01
+ *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C6_RESIDENCY:  Package C6 Residency Counter.
+ *                            perf code: 0x02
+ *                            Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C7_RESIDENCY:  Package C7 Residency Counter.
+ *                            perf code: 0x03
+ *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C8_RESIDENCY:  Package C8 Residency Counter.
+ *                            perf code: 0x04
+ *                            Available model: HSW ULT only
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C9_RESIDENCY:  Package C9 Residency Counter.
+ *                            perf code: 0x05
+ *                            Available model: HSW ULT only
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
+ *                            perf code: 0x06
+ *                            Available model: HSW ULT only
+ *                            Scope: Package (physical package)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "../perf_event.h"
+
+#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format)                \
+static ssize_t __cstate_##_var##_show(struct kobject *kobj,    \
+                               struct kobj_attribute *attr,    \
+                               char *page)                     \
+{                                                              \
+       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
+       return sprintf(page, _format "\n");                     \
+}                                                              \
+static struct kobj_attribute format_attr_##_var =              \
+       __ATTR(_name, 0444, __cstate_##_var##_show, NULL)
+
+static ssize_t cstate_get_attr_cpumask(struct device *dev,
+                                      struct device_attribute *attr,
+                                      char *buf);
+
+struct perf_cstate_msr {
+       u64     msr;
+       struct  perf_pmu_events_attr *attr;
+       bool    (*test)(int idx);
+};
+
+
+/* cstate_core PMU */
+
+static struct pmu cstate_core_pmu;
+static bool has_cstate_core;
+
+enum perf_cstate_core_id {
+       /*
+        * cstate_core events
+        */
+       PERF_CSTATE_CORE_C1_RES = 0,
+       PERF_CSTATE_CORE_C3_RES,
+       PERF_CSTATE_CORE_C6_RES,
+       PERF_CSTATE_CORE_C7_RES,
+
+       PERF_CSTATE_CORE_EVENT_MAX,
+};
+
+bool test_core(int idx)
+{
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+           boot_cpu_data.x86 != 6)
+               return false;
+
+       switch (boot_cpu_data.x86_model) {
+       case 30: /* 45nm Nehalem    */
+       case 26: /* 45nm Nehalem-EP */
+       case 46: /* 45nm Nehalem-EX */
+
+       case 37: /* 32nm Westmere    */
+       case 44: /* 32nm Westmere-EP */
+       case 47: /* 32nm Westmere-EX */
+               if (idx == PERF_CSTATE_CORE_C3_RES ||
+                   idx == PERF_CSTATE_CORE_C6_RES)
+                       return true;
+               break;
+       case 42: /* 32nm SandyBridge         */
+       case 45: /* 32nm SandyBridge-E/EN/EP */
+
+       case 58: /* 22nm IvyBridge       */
+       case 62: /* 22nm IvyBridge-EP/EX */
+
+       case 60: /* 22nm Haswell Core */
+       case 63: /* 22nm Haswell Server */
+       case 69: /* 22nm Haswell ULT */
+       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+       case 61: /* 14nm Broadwell Core-M */
+       case 86: /* 14nm Broadwell Xeon D */
+       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+       case 79: /* 14nm Broadwell Server */
+
+       case 78: /* 14nm Skylake Mobile */
+       case 94: /* 14nm Skylake Desktop */
+               if (idx == PERF_CSTATE_CORE_C3_RES ||
+                   idx == PERF_CSTATE_CORE_C6_RES ||
+                   idx == PERF_CSTATE_CORE_C7_RES)
+                       return true;
+               break;
+       case 55: /* 22nm Atom "Silvermont"                */
+       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+       case 76: /* 14nm Atom "Airmont"                   */
+               if (idx == PERF_CSTATE_CORE_C1_RES ||
+                   idx == PERF_CSTATE_CORE_C6_RES)
+                       return true;
+               break;
+       }
+
+       return false;
+}
+
+PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00");
+PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01");
+PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02");
+PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03");
+
+static struct perf_cstate_msr core_msr[] = {
+       [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES,          &evattr_cstate_core_c1, test_core, },
+       [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY,    &evattr_cstate_core_c3, test_core, },
+       [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY,    &evattr_cstate_core_c6, test_core, },
+       [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY,    &evattr_cstate_core_c7, test_core, },
+};
+
+static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = {
+       NULL,
+};
+
+static struct attribute_group core_events_attr_group = {
+       .name = "events",
+       .attrs = core_events_attrs,
+};
+
+DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63");
+static struct attribute *core_format_attrs[] = {
+       &format_attr_core_event.attr,
+       NULL,
+};
+
+static struct attribute_group core_format_attr_group = {
+       .name = "format",
+       .attrs = core_format_attrs,
+};
+
+static cpumask_t cstate_core_cpu_mask;
+static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL);
+
+static struct attribute *cstate_cpumask_attrs[] = {
+       &dev_attr_cpumask.attr,
+       NULL,
+};
+
+static struct attribute_group cpumask_attr_group = {
+       .attrs = cstate_cpumask_attrs,
+};
+
+static const struct attribute_group *core_attr_groups[] = {
+       &core_events_attr_group,
+       &core_format_attr_group,
+       &cpumask_attr_group,
+       NULL,
+};
+
+/* cstate_core PMU end */
+
+
+/* cstate_pkg PMU */
+
+static struct pmu cstate_pkg_pmu;
+static bool has_cstate_pkg;
+
+enum perf_cstate_pkg_id {
+       /*
+        * cstate_pkg events
+        */
+       PERF_CSTATE_PKG_C2_RES = 0,
+       PERF_CSTATE_PKG_C3_RES,
+       PERF_CSTATE_PKG_C6_RES,
+       PERF_CSTATE_PKG_C7_RES,
+       PERF_CSTATE_PKG_C8_RES,
+       PERF_CSTATE_PKG_C9_RES,
+       PERF_CSTATE_PKG_C10_RES,
+
+       PERF_CSTATE_PKG_EVENT_MAX,
+};
+
+bool test_pkg(int idx)
+{
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+           boot_cpu_data.x86 != 6)
+               return false;
+
+       switch (boot_cpu_data.x86_model) {
+       case 30: /* 45nm Nehalem    */
+       case 26: /* 45nm Nehalem-EP */
+       case 46: /* 45nm Nehalem-EX */
+
+       case 37: /* 32nm Westmere    */
+       case 44: /* 32nm Westmere-EP */
+       case 47: /* 32nm Westmere-EX */
+               if (idx == PERF_CSTATE_CORE_C3_RES ||
+                   idx == PERF_CSTATE_CORE_C6_RES ||
+                   idx == PERF_CSTATE_CORE_C7_RES)
+                       return true;
+               break;
+       case 42: /* 32nm SandyBridge         */
+       case 45: /* 32nm SandyBridge-E/EN/EP */
+
+       case 58: /* 22nm IvyBridge       */
+       case 62: /* 22nm IvyBridge-EP/EX */
+
+       case 60: /* 22nm Haswell Core */
+       case 63: /* 22nm Haswell Server */
+       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+       case 61: /* 14nm Broadwell Core-M */
+       case 86: /* 14nm Broadwell Xeon D */
+       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+       case 79: /* 14nm Broadwell Server */
+
+       case 78: /* 14nm Skylake Mobile */
+       case 94: /* 14nm Skylake Desktop */
+               if (idx == PERF_CSTATE_PKG_C2_RES ||
+                   idx == PERF_CSTATE_PKG_C3_RES ||
+                   idx == PERF_CSTATE_PKG_C6_RES ||
+                   idx == PERF_CSTATE_PKG_C7_RES)
+                       return true;
+               break;
+       case 55: /* 22nm Atom "Silvermont"                */
+       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+       case 76: /* 14nm Atom "Airmont"                   */
+               if (idx == PERF_CSTATE_CORE_C6_RES)
+                       return true;
+               break;
+       case 69: /* 22nm Haswell ULT */
+               if (idx == PERF_CSTATE_PKG_C2_RES ||
+                   idx == PERF_CSTATE_PKG_C3_RES ||
+                   idx == PERF_CSTATE_PKG_C6_RES ||
+                   idx == PERF_CSTATE_PKG_C7_RES ||
+                   idx == PERF_CSTATE_PKG_C8_RES ||
+                   idx == PERF_CSTATE_PKG_C9_RES ||
+                   idx == PERF_CSTATE_PKG_C10_RES)
+                       return true;
+               break;
+       }
+
+       return false;
+}
+
+PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00");
+PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01");
+PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02");
+PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03");
+PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04");
+PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05");
+PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06");
+
+static struct perf_cstate_msr pkg_msr[] = {
+       [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY,      &evattr_cstate_pkg_c2,  test_pkg, },
+       [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY,      &evattr_cstate_pkg_c3,  test_pkg, },
+       [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY,      &evattr_cstate_pkg_c6,  test_pkg, },
+       [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY,      &evattr_cstate_pkg_c7,  test_pkg, },
+       [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY,      &evattr_cstate_pkg_c8,  test_pkg, },
+       [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY,      &evattr_cstate_pkg_c9,  test_pkg, },
+       [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY,    &evattr_cstate_pkg_c10, test_pkg, },
+};
+
+static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = {
+       NULL,
+};
+
+static struct attribute_group pkg_events_attr_group = {
+       .name = "events",
+       .attrs = pkg_events_attrs,
+};
+
+DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63");
+static struct attribute *pkg_format_attrs[] = {
+       &format_attr_pkg_event.attr,
+       NULL,
+};
+static struct attribute_group pkg_format_attr_group = {
+       .name = "format",
+       .attrs = pkg_format_attrs,
+};
+
+static cpumask_t cstate_pkg_cpu_mask;
+
+static const struct attribute_group *pkg_attr_groups[] = {
+       &pkg_events_attr_group,
+       &pkg_format_attr_group,
+       &cpumask_attr_group,
+       NULL,
+};
+
+/* cstate_pkg PMU end*/
+
+static ssize_t cstate_get_attr_cpumask(struct device *dev,
+                                      struct device_attribute *attr,
+                                      char *buf)
+{
+       struct pmu *pmu = dev_get_drvdata(dev);
+
+       if (pmu == &cstate_core_pmu)
+               return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
+       else if (pmu == &cstate_pkg_pmu)
+               return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
+       else
+               return 0;
+}
+
+static int cstate_pmu_event_init(struct perf_event *event)
+{
+       u64 cfg = event->attr.config;
+       int ret = 0;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       /* unsupported modes and filters */
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest  ||
+           event->attr.sample_period) /* no sampling */
+               return -EINVAL;
+
+       if (event->pmu == &cstate_core_pmu) {
+               if (cfg >= PERF_CSTATE_CORE_EVENT_MAX)
+                       return -EINVAL;
+               if (!core_msr[cfg].attr)
+                       return -EINVAL;
+               event->hw.event_base = core_msr[cfg].msr;
+       } else if (event->pmu == &cstate_pkg_pmu) {
+               if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
+                       return -EINVAL;
+               if (!pkg_msr[cfg].attr)
+                       return -EINVAL;
+               event->hw.event_base = pkg_msr[cfg].msr;
+       } else
+               return -ENOENT;
+
+       /* must be done before validate_group */
+       event->hw.config = cfg;
+       event->hw.idx = -1;
+
+       return ret;
+}
+
+static inline u64 cstate_pmu_read_counter(struct perf_event *event)
+{
+       u64 val;
+
+       rdmsrl(event->hw.event_base, val);
+       return val;
+}
+
+static void cstate_pmu_event_update(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 prev_raw_count, new_raw_count;
+
+again:
+       prev_raw_count = local64_read(&hwc->prev_count);
+       new_raw_count = cstate_pmu_read_counter(event);
+
+       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                           new_raw_count) != prev_raw_count)
+               goto again;
+
+       local64_add(new_raw_count - prev_raw_count, &event->count);
+}
+
+static void cstate_pmu_event_start(struct perf_event *event, int mode)
+{
+       local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event));
+}
+
+static void cstate_pmu_event_stop(struct perf_event *event, int mode)
+{
+       cstate_pmu_event_update(event);
+}
+
+static void cstate_pmu_event_del(struct perf_event *event, int mode)
+{
+       cstate_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int cstate_pmu_event_add(struct perf_event *event, int mode)
+{
+       if (mode & PERF_EF_START)
+               cstate_pmu_event_start(event, mode);
+
+       return 0;
+}
+
+static void cstate_cpu_exit(int cpu)
+{
+       int i, id, target;
+
+       /* cpu exit for cstate core */
+       if (has_cstate_core) {
+               id = topology_core_id(cpu);
+               target = -1;
+
+               for_each_online_cpu(i) {
+                       if (i == cpu)
+                               continue;
+                       if (id == topology_core_id(i)) {
+                               target = i;
+                               break;
+                       }
+               }
+               if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0)
+                       cpumask_set_cpu(target, &cstate_core_cpu_mask);
+               WARN_ON(cpumask_empty(&cstate_core_cpu_mask));
+               if (target >= 0)
+                       perf_pmu_migrate_context(&cstate_core_pmu, cpu, target);
+       }
+
+       /* cpu exit for cstate pkg */
+       if (has_cstate_pkg) {
+               id = topology_physical_package_id(cpu);
+               target = -1;
+
+               for_each_online_cpu(i) {
+                       if (i == cpu)
+                               continue;
+                       if (id == topology_physical_package_id(i)) {
+                               target = i;
+                               break;
+                       }
+               }
+               if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0)
+                       cpumask_set_cpu(target, &cstate_pkg_cpu_mask);
+               WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask));
+               if (target >= 0)
+                       perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
+       }
+}
+
+static void cstate_cpu_init(int cpu)
+{
+       int i, id;
+
+       /* cpu init for cstate core */
+       if (has_cstate_core) {
+               id = topology_core_id(cpu);
+               for_each_cpu(i, &cstate_core_cpu_mask) {
+                       if (id == topology_core_id(i))
+                               break;
+               }
+               if (i >= nr_cpu_ids)
+                       cpumask_set_cpu(cpu, &cstate_core_cpu_mask);
+       }
+
+       /* cpu init for cstate pkg */
+       if (has_cstate_pkg) {
+               id = topology_physical_package_id(cpu);
+               for_each_cpu(i, &cstate_pkg_cpu_mask) {
+                       if (id == topology_physical_package_id(i))
+                               break;
+               }
+               if (i >= nr_cpu_ids)
+                       cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
+       }
+}
+
+static int cstate_cpu_notifier(struct notifier_block *self,
+                                 unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               break;
+       case CPU_STARTING:
+               cstate_cpu_init(cpu);
+               break;
+       case CPU_UP_CANCELED:
+       case CPU_DYING:
+               break;
+       case CPU_ONLINE:
+       case CPU_DEAD:
+               break;
+       case CPU_DOWN_PREPARE:
+               cstate_cpu_exit(cpu);
+               break;
+       default:
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+/*
+ * Probe the cstate events and insert the available one into sysfs attrs
+ * Return false if there is no available events.
+ */
+static bool cstate_probe_msr(struct perf_cstate_msr *msr,
+                            struct attribute   **events_attrs,
+                            int max_event_nr)
+{
+       int i, j = 0;
+       u64 val;
+
+       /* Probe the cstate events. */
+       for (i = 0; i < max_event_nr; i++) {
+               if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
+                       msr[i].attr = NULL;
+       }
+
+       /* List remaining events in the sysfs attrs. */
+       for (i = 0; i < max_event_nr; i++) {
+               if (msr[i].attr)
+                       events_attrs[j++] = &msr[i].attr->attr.attr;
+       }
+       events_attrs[j] = NULL;
+
+       return (j > 0) ? true : false;
+}
+
+static int __init cstate_init(void)
+{
+       /* SLM has different MSR for PKG C6 */
+       switch (boot_cpu_data.x86_model) {
+       case 55:
+       case 76:
+       case 77:
+               pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY;
+       }
+
+       if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX))
+               has_cstate_core = true;
+
+       if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX))
+               has_cstate_pkg = true;
+
+       return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV;
+}
+
+static void __init cstate_cpumask_init(void)
+{
+       int cpu;
+
+       cpu_notifier_register_begin();
+
+       for_each_online_cpu(cpu)
+               cstate_cpu_init(cpu);
+
+       __perf_cpu_notifier(cstate_cpu_notifier);
+
+       cpu_notifier_register_done();
+}
+
+static struct pmu cstate_core_pmu = {
+       .attr_groups    = core_attr_groups,
+       .name           = "cstate_core",
+       .task_ctx_nr    = perf_invalid_context,
+       .event_init     = cstate_pmu_event_init,
+       .add            = cstate_pmu_event_add, /* must have */
+       .del            = cstate_pmu_event_del, /* must have */
+       .start          = cstate_pmu_event_start,
+       .stop           = cstate_pmu_event_stop,
+       .read           = cstate_pmu_event_update,
+       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static struct pmu cstate_pkg_pmu = {
+       .attr_groups    = pkg_attr_groups,
+       .name           = "cstate_pkg",
+       .task_ctx_nr    = perf_invalid_context,
+       .event_init     = cstate_pmu_event_init,
+       .add            = cstate_pmu_event_add, /* must have */
+       .del            = cstate_pmu_event_del, /* must have */
+       .start          = cstate_pmu_event_start,
+       .stop           = cstate_pmu_event_stop,
+       .read           = cstate_pmu_event_update,
+       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static void __init cstate_pmus_register(void)
+{
+       int err;
+
+       if (has_cstate_core) {
+               err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
+               if (WARN_ON(err))
+                       pr_info("Failed to register PMU %s error %d\n",
+                               cstate_core_pmu.name, err);
+       }
+
+       if (has_cstate_pkg) {
+               err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1);
+               if (WARN_ON(err))
+                       pr_info("Failed to register PMU %s error %d\n",
+                               cstate_pkg_pmu.name, err);
+       }
+}
+
+static int __init cstate_pmu_init(void)
+{
+       int err;
+
+       if (cpu_has_hypervisor)
+               return -ENODEV;
+
+       err = cstate_init();
+       if (err)
+               return err;
+
+       cstate_cpumask_init();
+
+       cstate_pmus_register();
+
+       return 0;
+}
+
+device_initcall(cstate_pmu_init);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c

new file mode 100644 (file)

index 0000000..ce7211a
--- /dev/null
+++ b/arch/x86/events/intel/ds.c
@@ -0,0 +1,1410 @@
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+
+#include <asm/perf_event.h>
+#include <asm/insn.h>
+
+#include "../perf_event.h"
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE                24
+
+#define BTS_BUFFER_SIZE                (PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE       (PAGE_SIZE << 4)
+#define PEBS_FIXUP_SIZE                PAGE_SIZE
+
+/*
+ * pebs_record_32 for p4 and core not supported
+
+struct pebs_record_32 {
+       u32 flags, ip;
+       u32 ax, bc, cx, dx;
+       u32 si, di, bp, sp;
+};
+
+ */
+
+union intel_x86_pebs_dse {
+       u64 val;
+       struct {
+               unsigned int ld_dse:4;
+               unsigned int ld_stlb_miss:1;
+               unsigned int ld_locked:1;
+               unsigned int ld_reserved:26;
+       };
+       struct {
+               unsigned int st_l1d_hit:1;
+               unsigned int st_reserved1:3;
+               unsigned int st_stlb_miss:1;
+               unsigned int st_locked:1;
+               unsigned int st_reserved2:26;
+       };
+};
+
+
+/*
+ * Map PEBS Load Latency Data Source encodings to generic
+ * memory data source information
+ */
+#define P(a, b) PERF_MEM_S(a, b)
+#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
+#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
+
+/* Version for Sandy Bridge and later */
+static u64 pebs_data_source[] = {
+       P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
+       OP_LH | P(LVL, L1)  | P(SNOOP, NONE),   /* 0x01: L1 local */
+       OP_LH | P(LVL, LFB) | P(SNOOP, NONE),   /* 0x02: LFB hit */
+       OP_LH | P(LVL, L2)  | P(SNOOP, NONE),   /* 0x03: L2 hit */
+       OP_LH | P(LVL, L3)  | P(SNOOP, NONE),   /* 0x04: L3 hit */
+       OP_LH | P(LVL, L3)  | P(SNOOP, MISS),   /* 0x05: L3 hit, snoop miss */
+       OP_LH | P(LVL, L3)  | P(SNOOP, HIT),    /* 0x06: L3 hit, snoop hit */
+       OP_LH | P(LVL, L3)  | P(SNOOP, HITM),   /* 0x07: L3 hit, snoop hitm */
+       OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT),  /* 0x08: L3 miss snoop hit */
+       OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
+       OP_LH | P(LVL, LOC_RAM)  | P(SNOOP, HIT),  /* 0x0a: L3 miss, shared */
+       OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT),  /* 0x0b: L3 miss, shared */
+       OP_LH | P(LVL, LOC_RAM)  | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
+       OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
+       OP_LH | P(LVL, IO)  | P(SNOOP, NONE), /* 0x0e: I/O */
+       OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
+};
+
+/* Patch up minor differences in the bits */
+void __init intel_pmu_pebs_data_source_nhm(void)
+{
+       pebs_data_source[0x05] = OP_LH | P(LVL, L3)  | P(SNOOP, HIT);
+       pebs_data_source[0x06] = OP_LH | P(LVL, L3)  | P(SNOOP, HITM);
+       pebs_data_source[0x07] = OP_LH | P(LVL, L3)  | P(SNOOP, HITM);
+}
+
+static u64 precise_store_data(u64 status)
+{
+       union intel_x86_pebs_dse dse;
+       u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2);
+
+       dse.val = status;
+
+       /*
+        * bit 4: TLB access
+        * 1 = stored missed 2nd level TLB
+        *
+        * so it either hit the walker or the OS
+        * otherwise hit 2nd level TLB
+        */
+       if (dse.st_stlb_miss)
+               val |= P(TLB, MISS);
+       else
+               val |= P(TLB, HIT);
+
+       /*
+        * bit 0: hit L1 data cache
+        * if not set, then all we know is that
+        * it missed L1D
+        */
+       if (dse.st_l1d_hit)
+               val |= P(LVL, HIT);
+       else
+               val |= P(LVL, MISS);
+
+       /*
+        * bit 5: Locked prefix
+        */
+       if (dse.st_locked)
+               val |= P(LOCK, LOCKED);
+
+       return val;
+}
+
+static u64 precise_datala_hsw(struct perf_event *event, u64 status)
+{
+       union perf_mem_data_src dse;
+
+       dse.val = PERF_MEM_NA;
+
+       if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
+               dse.mem_op = PERF_MEM_OP_STORE;
+       else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW)
+               dse.mem_op = PERF_MEM_OP_LOAD;
+
+       /*
+        * L1 info only valid for following events:
+        *
+        * MEM_UOPS_RETIRED.STLB_MISS_STORES
+        * MEM_UOPS_RETIRED.LOCK_STORES
+        * MEM_UOPS_RETIRED.SPLIT_STORES
+        * MEM_UOPS_RETIRED.ALL_STORES
+        */
+       if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) {
+               if (status & 1)
+                       dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+               else
+                       dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
+       }
+       return dse.val;
+}
+
+static u64 load_latency_data(u64 status)
+{
+       union intel_x86_pebs_dse dse;
+       u64 val;
+       int model = boot_cpu_data.x86_model;
+       int fam = boot_cpu_data.x86;
+
+       dse.val = status;
+
+       /*
+        * use the mapping table for bit 0-3
+        */
+       val = pebs_data_source[dse.ld_dse];
+
+       /*
+        * Nehalem models do not support TLB, Lock infos
+        */
+       if (fam == 0x6 && (model == 26 || model == 30
+           || model == 31 || model == 46)) {
+               val |= P(TLB, NA) | P(LOCK, NA);
+               return val;
+       }
+       /*
+        * bit 4: TLB access
+        * 0 = did not miss 2nd level TLB
+        * 1 = missed 2nd level TLB
+        */
+       if (dse.ld_stlb_miss)
+               val |= P(TLB, MISS) | P(TLB, L2);
+       else
+               val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+       /*
+        * bit 5: locked prefix
+        */
+       if (dse.ld_locked)
+               val |= P(LOCK, LOCKED);
+
+       return val;
+}
+
+struct pebs_record_core {
+       u64 flags, ip;
+       u64 ax, bx, cx, dx;
+       u64 si, di, bp, sp;
+       u64 r8,  r9,  r10, r11;
+       u64 r12, r13, r14, r15;
+};
+
+struct pebs_record_nhm {
+       u64 flags, ip;
+       u64 ax, bx, cx, dx;
+       u64 si, di, bp, sp;
+       u64 r8,  r9,  r10, r11;
+       u64 r12, r13, r14, r15;
+       u64 status, dla, dse, lat;
+};
+
+/*
+ * Same as pebs_record_nhm, with two additional fields.
+ */
+struct pebs_record_hsw {
+       u64 flags, ip;
+       u64 ax, bx, cx, dx;
+       u64 si, di, bp, sp;
+       u64 r8,  r9,  r10, r11;
+       u64 r12, r13, r14, r15;
+       u64 status, dla, dse, lat;
+       u64 real_ip, tsx_tuning;
+};
+
+union hsw_tsx_tuning {
+       struct {
+               u32 cycles_last_block     : 32,
+                   hle_abort             : 1,
+                   rtm_abort             : 1,
+                   instruction_abort     : 1,
+                   non_instruction_abort : 1,
+                   retry                 : 1,
+                   data_conflict         : 1,
+                   capacity_writes       : 1,
+                   capacity_reads        : 1;
+       };
+       u64         value;
+};
+
+#define PEBS_HSW_TSX_FLAGS     0xff00000000ULL
+
+/* Same as HSW, plus TSC */
+
+struct pebs_record_skl {
+       u64 flags, ip;
+       u64 ax, bx, cx, dx;
+       u64 si, di, bp, sp;
+       u64 r8,  r9,  r10, r11;
+       u64 r12, r13, r14, r15;
+       u64 status, dla, dse, lat;
+       u64 real_ip, tsx_tuning;
+       u64 tsc;
+};
+
+void init_debug_store_on_cpu(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+       if (!ds)
+               return;
+
+       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+                    (u32)((u64)(unsigned long)ds),
+                    (u32)((u64)(unsigned long)ds >> 32));
+}
+
+void fini_debug_store_on_cpu(int cpu)
+{
+       if (!per_cpu(cpu_hw_events, cpu).ds)
+               return;
+
+       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+static DEFINE_PER_CPU(void *, insn_buffer);
+
+static int alloc_pebs_buffer(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+       int node = cpu_to_node(cpu);
+       int max;
+       void *buffer, *ibuffer;
+
+       if (!x86_pmu.pebs)
+               return 0;
+
+       buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+       if (unlikely(!buffer))
+               return -ENOMEM;
+
+       /*
+        * HSW+ already provides us the eventing ip; no need to allocate this
+        * buffer then.
+        */
+       if (x86_pmu.intel_cap.pebs_format < 2) {
+               ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
+               if (!ibuffer) {
+                       kfree(buffer);
+                       return -ENOMEM;
+               }
+               per_cpu(insn_buffer, cpu) = ibuffer;
+       }
+
+       max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
+
+       ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+       ds->pebs_index = ds->pebs_buffer_base;
+       ds->pebs_absolute_maximum = ds->pebs_buffer_base +
+               max * x86_pmu.pebs_record_size;
+
+       return 0;
+}
+
+static void release_pebs_buffer(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+       if (!ds || !x86_pmu.pebs)
+               return;
+
+       kfree(per_cpu(insn_buffer, cpu));
+       per_cpu(insn_buffer, cpu) = NULL;
+
+       kfree((void *)(unsigned long)ds->pebs_buffer_base);
+       ds->pebs_buffer_base = 0;
+}
+
+static int alloc_bts_buffer(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+       int node = cpu_to_node(cpu);
+       int max, thresh;
+       void *buffer;
+
+       if (!x86_pmu.bts)
+               return 0;
+
+       buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+       if (unlikely(!buffer)) {
+               WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
+               return -ENOMEM;
+       }
+
+       max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+       thresh = max / 16;
+
+       ds->bts_buffer_base = (u64)(unsigned long)buffer;
+       ds->bts_index = ds->bts_buffer_base;
+       ds->bts_absolute_maximum = ds->bts_buffer_base +
+               max * BTS_RECORD_SIZE;
+       ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+               thresh * BTS_RECORD_SIZE;
+
+       return 0;
+}
+
+static void release_bts_buffer(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+       if (!ds || !x86_pmu.bts)
+               return;
+
+       kfree((void *)(unsigned long)ds->bts_buffer_base);
+       ds->bts_buffer_base = 0;
+}
+
+static int alloc_ds_buffer(int cpu)
+{
+       int node = cpu_to_node(cpu);
+       struct debug_store *ds;
+
+       ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
+       if (unlikely(!ds))
+               return -ENOMEM;
+
+       per_cpu(cpu_hw_events, cpu).ds = ds;
+
+       return 0;
+}
+
+static void release_ds_buffer(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+       if (!ds)
+               return;
+
+       per_cpu(cpu_hw_events, cpu).ds = NULL;
+       kfree(ds);
+}
+
+void release_ds_buffers(void)
+{
+       int cpu;
+
+       if (!x86_pmu.bts && !x86_pmu.pebs)
+               return;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu)
+               fini_debug_store_on_cpu(cpu);
+
+       for_each_possible_cpu(cpu) {
+               release_pebs_buffer(cpu);
+               release_bts_buffer(cpu);
+               release_ds_buffer(cpu);
+       }
+       put_online_cpus();
+}
+
+void reserve_ds_buffers(void)
+{
+       int bts_err = 0, pebs_err = 0;
+       int cpu;
+
+       x86_pmu.bts_active = 0;
+       x86_pmu.pebs_active = 0;
+
+       if (!x86_pmu.bts && !x86_pmu.pebs)
+               return;
+
+       if (!x86_pmu.bts)
+               bts_err = 1;
+
+       if (!x86_pmu.pebs)
+               pebs_err = 1;
+
+       get_online_cpus();
+
+       for_each_possible_cpu(cpu) {
+               if (alloc_ds_buffer(cpu)) {
+                       bts_err = 1;
+                       pebs_err = 1;
+               }
+
+               if (!bts_err && alloc_bts_buffer(cpu))
+                       bts_err = 1;
+
+               if (!pebs_err && alloc_pebs_buffer(cpu))
+                       pebs_err = 1;
+
+               if (bts_err && pebs_err)
+                       break;
+       }
+
+       if (bts_err) {
+               for_each_possible_cpu(cpu)
+                       release_bts_buffer(cpu);
+       }
+
+       if (pebs_err) {
+               for_each_possible_cpu(cpu)
+                       release_pebs_buffer(cpu);
+       }
+
+       if (bts_err && pebs_err) {
+               for_each_possible_cpu(cpu)
+                       release_ds_buffer(cpu);
+       } else {
+               if (x86_pmu.bts && !bts_err)
+                       x86_pmu.bts_active = 1;
+
+               if (x86_pmu.pebs && !pebs_err)
+                       x86_pmu.pebs_active = 1;
+
+               for_each_online_cpu(cpu)
+                       init_debug_store_on_cpu(cpu);
+       }
+
+       put_online_cpus();
+}
+
+/*
+ * BTS
+ */
+
+struct event_constraint bts_constraint =
+       EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);
+
+void intel_pmu_enable_bts(u64 config)
+{
+       unsigned long debugctlmsr;
+
+       debugctlmsr = get_debugctlmsr();
+
+       debugctlmsr |= DEBUGCTLMSR_TR;
+       debugctlmsr |= DEBUGCTLMSR_BTS;
+       if (config & ARCH_PERFMON_EVENTSEL_INT)
+               debugctlmsr |= DEBUGCTLMSR_BTINT;
+
+       if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+               debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
+
+       if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+               debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
+
+       update_debugctlmsr(debugctlmsr);
+}
+
+void intel_pmu_disable_bts(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       unsigned long debugctlmsr;
+
+       if (!cpuc->ds)
+               return;
+
+       debugctlmsr = get_debugctlmsr();
+
+       debugctlmsr &=
+               ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
+                 DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
+
+       update_debugctlmsr(debugctlmsr);
+}
+
+int intel_pmu_drain_bts_buffer(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct debug_store *ds = cpuc->ds;
+       struct bts_record {
+               u64     from;
+               u64     to;
+               u64     flags;
+       };
+       struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
+       struct bts_record *at, *base, *top;
+       struct perf_output_handle handle;
+       struct perf_event_header header;
+       struct perf_sample_data data;
+       unsigned long skip = 0;
+       struct pt_regs regs;
+
+       if (!event)
+               return 0;
+
+       if (!x86_pmu.bts_active)
+               return 0;
+
+       base = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+       top  = (struct bts_record *)(unsigned long)ds->bts_index;
+
+       if (top <= base)
+               return 0;
+
+       memset(&regs, 0, sizeof(regs));
+
+       ds->bts_index = ds->bts_buffer_base;
+
+       perf_sample_data_init(&data, 0, event->hw.last_period);
+
+       /*
+        * BTS leaks kernel addresses in branches across the cpl boundary,
+        * such as traps or system calls, so unless the user is asking for
+        * kernel tracing (and right now it's not possible), we'd need to
+        * filter them out. But first we need to count how many of those we
+        * have in the current batch. This is an extra O(n) pass, however,
+        * it's much faster than the other one especially considering that
+        * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the
+        * alloc_bts_buffer()).
+        */
+       for (at = base; at < top; at++) {
+               /*
+                * Note that right now *this* BTS code only works if
+                * attr::exclude_kernel is set, but let's keep this extra
+                * check here in case that changes.
+                */
+               if (event->attr.exclude_kernel &&
+                   (kernel_ip(at->from) || kernel_ip(at->to)))
+                       skip++;
+       }
+
+       /*
+        * Prepare a generic sample, i.e. fill in the invariant fields.
+        * We will overwrite the from and to address before we output
+        * the sample.
+        */
+       perf_prepare_sample(&header, &data, event, &regs);
+
+       if (perf_output_begin(&handle, event, header.size *
+                             (top - base - skip)))
+               return 1;
+
+       for (at = base; at < top; at++) {
+               /* Filter out any records that contain kernel addresses. */
+               if (event->attr.exclude_kernel &&
+                   (kernel_ip(at->from) || kernel_ip(at->to)))
+                       continue;
+
+               data.ip         = at->from;
+               data.addr       = at->to;
+
+               perf_output_sample(&handle, &header, &data, event);
+       }
+
+       perf_output_end(&handle);
+
+       /* There's new data available. */
+       event->hw.interrupts++;
+       event->pending_kill = POLL_IN;
+       return 1;
+}
+
+static inline void intel_pmu_drain_pebs_buffer(void)
+{
+       struct pt_regs regs;
+
+       x86_pmu.drain_pebs(&regs);
+}
+
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+       if (!sched_in)
+               intel_pmu_drain_pebs_buffer();
+}
+
+/*
+ * PEBS
+ */
+struct event_constraint intel_core2_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
+       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_atom_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
+       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_slm_pebs_event_constraints[] = {
+       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1),
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_nehalem_pebs_event_constraints[] = {
+       INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf),    /* INST_RETIRED.ANY */
+       INTEL_EVENT_CONSTRAINT(0xc2, 0xf),    /* UOPS_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf),    /* SSEX_UOPS_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
+       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_westmere_pebs_event_constraints[] = {
+       INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf),    /* INSTR_RETIRED.* */
+       INTEL_EVENT_CONSTRAINT(0xc2, 0xf),    /* UOPS_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf),    /* BR_MISP_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf),    /* SSEX_UOPS_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
+       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_snb_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+       INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
+       INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
+       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+        INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_ivb_pebs_event_constraints[] = {
+        INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+        INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
+       INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
+       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+        EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_hsw_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+       INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
+       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_bdw_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+       INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
+       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+       EVENT_CONSTRAINT_END
+};
+
+
+struct event_constraint intel_skl_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2),      /* INST_RETIRED.PREC_DIST */
+       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+       /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
+       INTEL_PLD_CONSTRAINT(0x1cd, 0xf),                     /* MEM_TRANS_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_L3_HIT_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_L3_MISS_RETIRED.* */
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint *intel_pebs_constraints(struct perf_event *event)
+{
+       struct event_constraint *c;
+
+       if (!event->attr.precise_ip)
+               return NULL;
+
+       if (x86_pmu.pebs_constraints) {
+               for_each_event_constraint(c, x86_pmu.pebs_constraints) {
+                       if ((event->hw.config & c->cmask) == c->code) {
+                               event->hw.flags |= c->flags;
+                               return c;
+                       }
+               }
+       }
+
+       return &emptyconstraint;
+}
+
+static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
+{
+       return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
+}
+
+void intel_pmu_pebs_enable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       struct debug_store *ds = cpuc->ds;
+       bool first_pebs;
+       u64 threshold;
+
+       hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
+
+       first_pebs = !pebs_is_enabled(cpuc);
+       cpuc->pebs_enabled |= 1ULL << hwc->idx;
+
+       if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
+               cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
+       else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
+               cpuc->pebs_enabled |= 1ULL << 63;
+
+       /*
+        * When the event is constrained enough we can use a larger
+        * threshold and run the event with less frequent PMI.
+        */
+       if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
+               threshold = ds->pebs_absolute_maximum -
+                       x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+
+               if (first_pebs)
+                       perf_sched_cb_inc(event->ctx->pmu);
+       } else {
+               threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+
+               /*
+                * If not all events can use larger buffer,
+                * roll back to threshold = 1
+                */
+               if (!first_pebs &&
+                   (ds->pebs_interrupt_threshold > threshold))
+                       perf_sched_cb_dec(event->ctx->pmu);
+       }
+
+       /* Use auto-reload if possible to save a MSR write in the PMI */
+       if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
+               ds->pebs_event_reset[hwc->idx] =
+                       (u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
+       }
+
+       if (first_pebs || ds->pebs_interrupt_threshold > threshold)
+               ds->pebs_interrupt_threshold = threshold;
+}
+
+void intel_pmu_pebs_disable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       struct debug_store *ds = cpuc->ds;
+       bool large_pebs = ds->pebs_interrupt_threshold >
+               ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+
+       if (large_pebs)
+               intel_pmu_drain_pebs_buffer();
+
+       cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
+
+       if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
+               cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
+       else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
+               cpuc->pebs_enabled &= ~(1ULL << 63);
+
+       if (large_pebs && !pebs_is_enabled(cpuc))
+               perf_sched_cb_dec(event->ctx->pmu);
+
+       if (cpuc->enabled)
+               wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+
+       hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+}
+
+void intel_pmu_pebs_enable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (cpuc->pebs_enabled)
+               wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+}
+
+void intel_pmu_pebs_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (cpuc->pebs_enabled)
+               wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+}
+
+static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       unsigned long from = cpuc->lbr_entries[0].from;
+       unsigned long old_to, to = cpuc->lbr_entries[0].to;
+       unsigned long ip = regs->ip;
+       int is_64bit = 0;
+       void *kaddr;
+       int size;
+
+       /*
+        * We don't need to fixup if the PEBS assist is fault like
+        */
+       if (!x86_pmu.intel_cap.pebs_trap)
+               return 1;
+
+       /*
+        * No LBR entry, no basic block, no rewinding
+        */
+       if (!cpuc->lbr_stack.nr || !from || !to)
+               return 0;
+
+       /*
+        * Basic blocks should never cross user/kernel boundaries
+        */
+       if (kernel_ip(ip) != kernel_ip(to))
+               return 0;
+
+       /*
+        * unsigned math, either ip is before the start (impossible) or
+        * the basic block is larger than 1 page (sanity)
+        */
+       if ((ip - to) > PEBS_FIXUP_SIZE)
+               return 0;
+
+       /*
+        * We sampled a branch insn, rewind using the LBR stack
+        */
+       if (ip == to) {
+               set_linear_ip(regs, from);
+               return 1;
+       }
+
+       size = ip - to;
+       if (!kernel_ip(ip)) {
+               int bytes;
+               u8 *buf = this_cpu_read(insn_buffer);
+
+               /* 'size' must fit our buffer, see above */
+               bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+               if (bytes != 0)
+                       return 0;
+
+               kaddr = buf;
+       } else {
+               kaddr = (void *)to;
+       }
+
+       do {
+               struct insn insn;
+
+               old_to = to;
+
+#ifdef CONFIG_X86_64
+               is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
+#endif
+               insn_init(&insn, kaddr, size, is_64bit);
+               insn_get_length(&insn);
+               /*
+                * Make sure there was not a problem decoding the
+                * instruction and getting the length.  This is
+                * doubly important because we have an infinite
+                * loop if insn.length=0.
+                */
+               if (!insn.length)
+                       break;
+
+               to += insn.length;
+               kaddr += insn.length;
+               size -= insn.length;
+       } while (to < ip);
+
+       if (to == ip) {
+               set_linear_ip(regs, old_to);
+               return 1;
+       }
+
+       /*
+        * Even though we decoded the basic block, the instruction stream
+        * never matched the given IP, either the TO or the IP got corrupted.
+        */
+       return 0;
+}
+
+static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)
+{
+       if (pebs->tsx_tuning) {
+               union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
+               return tsx.cycles_last_block;
+       }
+       return 0;
+}
+
+static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
+{
+       u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
+
+       /* For RTM XABORTs also log the abort code from AX */
+       if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1))
+               txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
+       return txn;
+}
+
+static void setup_pebs_sample_data(struct perf_event *event,
+                                  struct pt_regs *iregs, void *__pebs,
+                                  struct perf_sample_data *data,
+                                  struct pt_regs *regs)
+{
+#define PERF_X86_EVENT_PEBS_HSW_PREC \
+               (PERF_X86_EVENT_PEBS_ST_HSW | \
+                PERF_X86_EVENT_PEBS_LD_HSW | \
+                PERF_X86_EVENT_PEBS_NA_HSW)
+       /*
+        * We cast to the biggest pebs_record but are careful not to
+        * unconditionally access the 'extra' entries.
+        */
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct pebs_record_skl *pebs = __pebs;
+       u64 sample_type;
+       int fll, fst, dsrc;
+       int fl = event->hw.flags;
+
+       if (pebs == NULL)
+               return;
+
+       sample_type = event->attr.sample_type;
+       dsrc = sample_type & PERF_SAMPLE_DATA_SRC;
+
+       fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
+       fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
+
+       perf_sample_data_init(data, 0, event->hw.last_period);
+
+       data->period = event->hw.last_period;
+
+       /*
+        * Use latency for weight (only avail with PEBS-LL)
+        */
+       if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
+               data->weight = pebs->lat;
+
+       /*
+        * data.data_src encodes the data source
+        */
+       if (dsrc) {
+               u64 val = PERF_MEM_NA;
+               if (fll)
+                       val = load_latency_data(pebs->dse);
+               else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
+                       val = precise_datala_hsw(event, pebs->dse);
+               else if (fst)
+                       val = precise_store_data(pebs->dse);
+               data->data_src.val = val;
+       }
+
+       /*
+        * We use the interrupt regs as a base because the PEBS record
+        * does not contain a full regs set, specifically it seems to
+        * lack segment descriptors, which get used by things like
+        * user_mode().
+        *
+        * In the simple case fix up only the IP and BP,SP regs, for
+        * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
+        * A possible PERF_SAMPLE_REGS will have to transfer all regs.
+        */
+       *regs = *iregs;
+       regs->flags = pebs->flags;
+       set_linear_ip(regs, pebs->ip);
+       regs->bp = pebs->bp;
+       regs->sp = pebs->sp;
+
+       if (sample_type & PERF_SAMPLE_REGS_INTR) {
+               regs->ax = pebs->ax;
+               regs->bx = pebs->bx;
+               regs->cx = pebs->cx;
+               regs->dx = pebs->dx;
+               regs->si = pebs->si;
+               regs->di = pebs->di;
+               regs->bp = pebs->bp;
+               regs->sp = pebs->sp;
+
+               regs->flags = pebs->flags;
+#ifndef CONFIG_X86_32
+               regs->r8 = pebs->r8;
+               regs->r9 = pebs->r9;
+               regs->r10 = pebs->r10;
+               regs->r11 = pebs->r11;
+               regs->r12 = pebs->r12;
+               regs->r13 = pebs->r13;
+               regs->r14 = pebs->r14;
+               regs->r15 = pebs->r15;
+#endif
+       }
+
+       if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
+               regs->ip = pebs->real_ip;
+               regs->flags |= PERF_EFLAGS_EXACT;
+       } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs))
+               regs->flags |= PERF_EFLAGS_EXACT;
+       else
+               regs->flags &= ~PERF_EFLAGS_EXACT;
+
+       if ((sample_type & PERF_SAMPLE_ADDR) &&
+           x86_pmu.intel_cap.pebs_format >= 1)
+               data->addr = pebs->dla;
+
+       if (x86_pmu.intel_cap.pebs_format >= 2) {
+               /* Only set the TSX weight when no memory weight. */
+               if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
+                       data->weight = intel_hsw_weight(pebs);
+
+               if (sample_type & PERF_SAMPLE_TRANSACTION)
+                       data->txn = intel_hsw_transaction(pebs);
+       }
+
+       /*
+        * v3 supplies an accurate time stamp, so we use that
+        * for the time stamp.
+        *
+        * We can only do this for the default trace clock.
+        */
+       if (x86_pmu.intel_cap.pebs_format >= 3 &&
+               event->attr.use_clockid == 0)
+               data->time = native_sched_clock_from_tsc(pebs->tsc);
+
+       if (has_branch_stack(event))
+               data->br_stack = &cpuc->lbr_stack;
+}
+
+static inline void *
+get_next_pebs_record_by_bit(void *base, void *top, int bit)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       void *at;
+       u64 pebs_status;
+
+       /*
+        * fmt0 does not have a status bitfield (does not use
+        * perf_record_nhm format)
+        */
+       if (x86_pmu.intel_cap.pebs_format < 1)
+               return base;
+
+       if (base == NULL)
+               return NULL;
+
+       for (at = base; at < top; at += x86_pmu.pebs_record_size) {
+               struct pebs_record_nhm *p = at;
+
+               if (test_bit(bit, (unsigned long *)&p->status)) {
+                       /* PEBS v3 has accurate status bits */
+                       if (x86_pmu.intel_cap.pebs_format >= 3)
+                               return at;
+
+                       if (p->status == (1 << bit))
+                               return at;
+
+                       /* clear non-PEBS bit and re-check */
+                       pebs_status = p->status & cpuc->pebs_enabled;
+                       pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
+                       if (pebs_status == (1 << bit))
+                               return at;
+               }
+       }
+       return NULL;
+}
+
+static void __intel_pmu_pebs_event(struct perf_event *event,
+                                  struct pt_regs *iregs,
+                                  void *base, void *top,
+                                  int bit, int count)
+{
+       struct perf_sample_data data;
+       struct pt_regs regs;
+       void *at = get_next_pebs_record_by_bit(base, top, bit);
+
+       if (!intel_pmu_save_and_restart(event) &&
+           !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
+               return;
+
+       while (count > 1) {
+               setup_pebs_sample_data(event, iregs, at, &data, &regs);
+               perf_event_output(event, &data, &regs);
+               at += x86_pmu.pebs_record_size;
+               at = get_next_pebs_record_by_bit(at, top, bit);
+               count--;
+       }
+
+       setup_pebs_sample_data(event, iregs, at, &data, &regs);
+
+       /*
+        * All but the last records are processed.
+        * The last one is left to be able to call the overflow handler.
+        */
+       if (perf_event_overflow(event, &data, &regs)) {
+               x86_pmu_stop(event, 0);
+               return;
+       }
+
+}
+
+static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct debug_store *ds = cpuc->ds;
+       struct perf_event *event = cpuc->events[0]; /* PMC0 only */
+       struct pebs_record_core *at, *top;
+       int n;
+
+       if (!x86_pmu.pebs_active)
+               return;
+
+       at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
+       top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
+
+       /*
+        * Whatever else happens, drain the thing
+        */
+       ds->pebs_index = ds->pebs_buffer_base;
+
+       if (!test_bit(0, cpuc->active_mask))
+               return;
+
+       WARN_ON_ONCE(!event);
+
+       if (!event->attr.precise_ip)
+               return;
+
+       n = top - at;
+       if (n <= 0)
+               return;
+
+       __intel_pmu_pebs_event(event, iregs, at, top, 0, n);
+}
+
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct debug_store *ds = cpuc->ds;
+       struct perf_event *event;
+       void *base, *at, *top;
+       short counts[MAX_PEBS_EVENTS] = {};
+       short error[MAX_PEBS_EVENTS] = {};
+       int bit, i;
+
+       if (!x86_pmu.pebs_active)
+               return;
+
+       base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+       top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
+
+       ds->pebs_index = ds->pebs_buffer_base;
+
+       if (unlikely(base >= top))
+               return;
+
+       for (at = base; at < top; at += x86_pmu.pebs_record_size) {
+               struct pebs_record_nhm *p = at;
+               u64 pebs_status;
+
+               /* PEBS v3 has accurate status bits */
+               if (x86_pmu.intel_cap.pebs_format >= 3) {
+                       for_each_set_bit(bit, (unsigned long *)&p->status,
+                                        MAX_PEBS_EVENTS)
+                               counts[bit]++;
+
+                       continue;
+               }
+
+               pebs_status = p->status & cpuc->pebs_enabled;
+               pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
+
+               /*
+                * On some CPUs the PEBS status can be zero when PEBS is
+                * racing with clearing of GLOBAL_STATUS.
+                *
+                * Normally we would drop that record, but in the
+                * case when there is only a single active PEBS event
+                * we can assume it's for that event.
+                */
+               if (!pebs_status && cpuc->pebs_enabled &&
+                       !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
+                       pebs_status = cpuc->pebs_enabled;
+
+               bit = find_first_bit((unsigned long *)&pebs_status,
+                                       x86_pmu.max_pebs_events);
+               if (bit >= x86_pmu.max_pebs_events)
+                       continue;
+
+               /*
+                * The PEBS hardware does not deal well with the situation
+                * when events happen near to each other and multiple bits
+                * are set. But it should happen rarely.
+                *
+                * If these events include one PEBS and multiple non-PEBS
+                * events, it doesn't impact PEBS record. The record will
+                * be handled normally. (slow path)
+                *
+                * If these events include two or more PEBS events, the
+                * records for the events can be collapsed into a single
+                * one, and it's not possible to reconstruct all events
+                * that caused the PEBS record. It's called collision.
+                * If collision happened, the record will be dropped.
+                */
+               if (p->status != (1ULL << bit)) {
+                       for_each_set_bit(i, (unsigned long *)&pebs_status,
+                                        x86_pmu.max_pebs_events)
+                               error[i]++;
+                       continue;
+               }
+
+               counts[bit]++;
+       }
+
+       for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
+               if ((counts[bit] == 0) && (error[bit] == 0))
+                       continue;
+
+               event = cpuc->events[bit];
+               WARN_ON_ONCE(!event);
+               WARN_ON_ONCE(!event->attr.precise_ip);
+
+               /* log dropped samples number */
+               if (error[bit])
+                       perf_log_lost_samples(event, error[bit]);
+
+               if (counts[bit]) {
+                       __intel_pmu_pebs_event(event, iregs, base,
+                                              top, bit, counts[bit]);
+               }
+       }
+}
+
+/*
+ * BTS, PEBS probe and setup
+ */
+
+void __init intel_ds_init(void)
+{
+       /*
+        * No support for 32bit formats
+        */
+       if (!boot_cpu_has(X86_FEATURE_DTES64))
+               return;
+
+       x86_pmu.bts  = boot_cpu_has(X86_FEATURE_BTS);
+       x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
+       x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
+       if (x86_pmu.pebs) {
+               char pebs_type = x86_pmu.intel_cap.pebs_trap ?  '+' : '-';
+               int format = x86_pmu.intel_cap.pebs_format;
+
+               switch (format) {
+               case 0:
+                       pr_cont("PEBS fmt0%c, ", pebs_type);
+                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
+                       /*
+                        * Using >PAGE_SIZE buffers makes the WRMSR to
+                        * PERF_GLOBAL_CTRL in intel_pmu_enable_all()
+                        * mysteriously hang on Core2.
+                        *
+                        * As a workaround, we don't do this.
+                        */
+                       x86_pmu.pebs_buffer_size = PAGE_SIZE;
+                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
+                       break;
+
+               case 1:
+                       pr_cont("PEBS fmt1%c, ", pebs_type);
+                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
+                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+                       break;
+
+               case 2:
+                       pr_cont("PEBS fmt2%c, ", pebs_type);
+                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw);
+                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+                       break;
+
+               case 3:
+                       pr_cont("PEBS fmt3%c, ", pebs_type);
+                       x86_pmu.pebs_record_size =
+                                               sizeof(struct pebs_record_skl);
+                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+                       x86_pmu.free_running_flags |= PERF_SAMPLE_TIME;
+                       break;
+
+               default:
+                       pr_cont("no PEBS fmt%d%c, ", format, pebs_type);
+                       x86_pmu.pebs = 0;
+               }
+       }
+}
+
+void perf_restore_debug_store(void)
+{
+       struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+
+       if (!x86_pmu.bts && !x86_pmu.pebs)
+               return;
+
+       wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds);
+}
diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c

new file mode 100644 (file)

index 0000000..548d5f7
--- /dev/null
+++ b/arch/x86/events/intel/knc.c
@@ -0,0 +1,321 @@
+/* Driver for Intel Xeon Phi "Knights Corner" PMU */
+
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include <asm/hardirq.h>
+
+#include "../perf_event.h"
+
+static const u64 knc_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]           = 0x002a,
+  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x0016,
+  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0028,
+  [PERF_COUNT_HW_CACHE_MISSES]         = 0x0029,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x0012,
+  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x002b,
+};
+
+static const u64 __initconst knc_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               /* On Xeon Phi event "0" is a valid DATA_READ          */
+               /*   (L1 Data Cache Reads) Instruction.                */
+               /* We code this as ARCH_PERFMON_EVENTSEL_INT as this   */
+               /* bit will always be set in x86_pmu_hw_config().      */
+               [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
+                                               /* DATA_READ           */
+               [ C(RESULT_MISS)   ] = 0x0003,  /* DATA_READ_MISS      */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0001,  /* DATA_WRITE          */
+               [ C(RESULT_MISS)   ] = 0x0004,  /* DATA_WRITE_MISS     */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0011,  /* L1_DATA_PF1         */
+               [ C(RESULT_MISS)   ] = 0x001c,  /* L1_DATA_PF1_MISS    */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x000c,  /* CODE_READ          */
+               [ C(RESULT_MISS)   ] = 0x000e,  /* CODE_CACHE_MISS    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0x10cb,  /* L2_READ_MISS */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x10cc,  /* L2_WRITE_HIT */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x10fc,  /* L2_DATA_PF2      */
+               [ C(RESULT_MISS)   ] = 0x10fe,  /* L2_DATA_PF2_MISS */
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
+                                               /* DATA_READ */
+                                               /* see note on L1 OP_READ */
+               [ C(RESULT_MISS)   ] = 0x0002,  /* DATA_PAGE_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0001,  /* DATA_WRITE */
+               [ C(RESULT_MISS)   ] = 0x0002,  /* DATA_PAGE_WALK */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x000c,  /* CODE_READ */
+               [ C(RESULT_MISS)   ] = 0x000d,  /* CODE_PAGE_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0012,  /* BRANCHES */
+               [ C(RESULT_MISS)   ] = 0x002b,  /* BRANCHES_MISPREDICTED */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+
+static u64 knc_pmu_event_map(int hw_event)
+{
+       return knc_perfmon_event_map[hw_event];
+}
+
+static struct event_constraint knc_event_constraints[] =
+{
+       INTEL_EVENT_CONSTRAINT(0xc3, 0x1),      /* HWP_L2HIT */
+       INTEL_EVENT_CONSTRAINT(0xc4, 0x1),      /* HWP_L2MISS */
+       INTEL_EVENT_CONSTRAINT(0xc8, 0x1),      /* L2_READ_HIT_E */
+       INTEL_EVENT_CONSTRAINT(0xc9, 0x1),      /* L2_READ_HIT_M */
+       INTEL_EVENT_CONSTRAINT(0xca, 0x1),      /* L2_READ_HIT_S */
+       INTEL_EVENT_CONSTRAINT(0xcb, 0x1),      /* L2_READ_MISS */
+       INTEL_EVENT_CONSTRAINT(0xcc, 0x1),      /* L2_WRITE_HIT */
+       INTEL_EVENT_CONSTRAINT(0xce, 0x1),      /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */
+       INTEL_EVENT_CONSTRAINT(0xcf, 0x1),      /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */
+       INTEL_EVENT_CONSTRAINT(0xd7, 0x1),      /* L2_VICTIM_REQ_WITH_DATA */
+       INTEL_EVENT_CONSTRAINT(0xe3, 0x1),      /* SNP_HITM_BUNIT */
+       INTEL_EVENT_CONSTRAINT(0xe6, 0x1),      /* SNP_HIT_L2 */
+       INTEL_EVENT_CONSTRAINT(0xe7, 0x1),      /* SNP_HITM_L2 */
+       INTEL_EVENT_CONSTRAINT(0xf1, 0x1),      /* L2_DATA_READ_MISS_CACHE_FILL */
+       INTEL_EVENT_CONSTRAINT(0xf2, 0x1),      /* L2_DATA_WRITE_MISS_CACHE_FILL */
+       INTEL_EVENT_CONSTRAINT(0xf6, 0x1),      /* L2_DATA_READ_MISS_MEM_FILL */
+       INTEL_EVENT_CONSTRAINT(0xf7, 0x1),      /* L2_DATA_WRITE_MISS_MEM_FILL */
+       INTEL_EVENT_CONSTRAINT(0xfc, 0x1),      /* L2_DATA_PF2 */
+       INTEL_EVENT_CONSTRAINT(0xfd, 0x1),      /* L2_DATA_PF2_DROP */
+       INTEL_EVENT_CONSTRAINT(0xfe, 0x1),      /* L2_DATA_PF2_MISS */
+       INTEL_EVENT_CONSTRAINT(0xff, 0x1),      /* L2_DATA_HIT_INFLIGHT_PF2 */
+       EVENT_CONSTRAINT_END
+};
+
+#define MSR_KNC_IA32_PERF_GLOBAL_STATUS                0x0000002d
+#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL   0x0000002e
+#define MSR_KNC_IA32_PERF_GLOBAL_CTRL          0x0000002f
+
+#define KNC_ENABLE_COUNTER0                    0x00000001
+#define KNC_ENABLE_COUNTER1                    0x00000002
+
+static void knc_pmu_disable_all(void)
+{
+       u64 val;
+
+       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+       val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
+       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+}
+
+static void knc_pmu_enable_all(int added)
+{
+       u64 val;
+
+       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+       val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
+       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+}
+
+static inline void
+knc_pmu_disable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 val;
+
+       val = hwc->config;
+       val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+
+       (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
+}
+
+static void knc_pmu_enable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 val;
+
+       val = hwc->config;
+       val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+
+       (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
+}
+
+static inline u64 knc_pmu_get_status(void)
+{
+       u64 status;
+
+       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status);
+
+       return status;
+}
+
+static inline void knc_pmu_ack_status(u64 ack)
+{
+       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack);
+}
+
+static int knc_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_events *cpuc;
+       int handled = 0;
+       int bit, loops;
+       u64 status;
+
+       cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       knc_pmu_disable_all();
+
+       status = knc_pmu_get_status();
+       if (!status) {
+               knc_pmu_enable_all(0);
+               return handled;
+       }
+
+       loops = 0;
+again:
+       knc_pmu_ack_status(status);
+       if (++loops > 100) {
+               WARN_ONCE(1, "perf: irq loop stuck!\n");
+               perf_event_print_debug();
+               goto done;
+       }
+
+       inc_irq_stat(apic_perf_irqs);
+
+       for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+               struct perf_event *event = cpuc->events[bit];
+
+               handled++;
+
+               if (!test_bit(bit, cpuc->active_mask))
+                       continue;
+
+               if (!intel_pmu_save_and_restart(event))
+                       continue;
+
+               perf_sample_data_init(&data, 0, event->hw.last_period);
+
+               if (perf_event_overflow(event, &data, regs))
+                       x86_pmu_stop(event, 0);
+       }
+
+       /*
+        * Repeat if there is more work to be done:
+        */
+       status = knc_pmu_get_status();
+       if (status)
+               goto again;
+
+done:
+       /* Only restore PMU state when it's active. See x86_pmu_disable(). */
+       if (cpuc->enabled)
+               knc_pmu_enable_all(0);
+
+       return handled;
+}
+
+
+PMU_FORMAT_ATTR(event, "config:0-7"    );
+PMU_FORMAT_ATTR(umask, "config:8-15"   );
+PMU_FORMAT_ATTR(edge,  "config:18"     );
+PMU_FORMAT_ATTR(inv,   "config:23"     );
+PMU_FORMAT_ATTR(cmask, "config:24-31"  );
+
+static struct attribute *intel_knc_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask.attr,
+       NULL,
+};
+
+static const struct x86_pmu knc_pmu __initconst = {
+       .name                   = "knc",
+       .handle_irq             = knc_pmu_handle_irq,
+       .disable_all            = knc_pmu_disable_all,
+       .enable_all             = knc_pmu_enable_all,
+       .enable                 = knc_pmu_enable_event,
+       .disable                = knc_pmu_disable_event,
+       .hw_config              = x86_pmu_hw_config,
+       .schedule_events        = x86_schedule_events,
+       .eventsel               = MSR_KNC_EVNTSEL0,
+       .perfctr                = MSR_KNC_PERFCTR0,
+       .event_map              = knc_pmu_event_map,
+       .max_events             = ARRAY_SIZE(knc_perfmon_event_map),
+       .apic                   = 1,
+       .max_period             = (1ULL << 39) - 1,
+       .version                = 0,
+       .num_counters           = 2,
+       .cntval_bits            = 40,
+       .cntval_mask            = (1ULL << 40) - 1,
+       .get_event_constraints  = x86_get_event_constraints,
+       .event_constraints      = knc_event_constraints,
+       .format_attrs           = intel_knc_formats_attr,
+};
+
+__init int knc_pmu_init(void)
+{
+       x86_pmu = knc_pmu;
+
+       memcpy(hw_cache_event_ids, knc_hw_cache_event_ids, 
+               sizeof(hw_cache_event_ids));
+
+       return 0;
+}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c

new file mode 100644 (file)

index 0000000..69dd118
--- /dev/null
+++ b/arch/x86/events/intel/lbr.c
@@ -0,0 +1,1062 @@
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+#include <asm/insn.h>
+
+#include "../perf_event.h"
+
+enum {
+       LBR_FORMAT_32           = 0x00,
+       LBR_FORMAT_LIP          = 0x01,
+       LBR_FORMAT_EIP          = 0x02,
+       LBR_FORMAT_EIP_FLAGS    = 0x03,
+       LBR_FORMAT_EIP_FLAGS2   = 0x04,
+       LBR_FORMAT_INFO         = 0x05,
+       LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_INFO,
+};
+
+static enum {
+       LBR_EIP_FLAGS           = 1,
+       LBR_TSX                 = 2,
+} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
+       [LBR_FORMAT_EIP_FLAGS]  = LBR_EIP_FLAGS,
+       [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
+};
+
+/*
+ * Intel LBR_SELECT bits
+ * Intel Vol3a, April 2011, Section 16.7 Table 16-10
+ *
+ * Hardware branch filter (not available on all CPUs)
+ */
+#define LBR_KERNEL_BIT         0 /* do not capture at ring0 */
+#define LBR_USER_BIT           1 /* do not capture at ring > 0 */
+#define LBR_JCC_BIT            2 /* do not capture conditional branches */
+#define LBR_REL_CALL_BIT       3 /* do not capture relative calls */
+#define LBR_IND_CALL_BIT       4 /* do not capture indirect calls */
+#define LBR_RETURN_BIT         5 /* do not capture near returns */
+#define LBR_IND_JMP_BIT                6 /* do not capture indirect jumps */
+#define LBR_REL_JMP_BIT                7 /* do not capture relative jumps */
+#define LBR_FAR_BIT            8 /* do not capture far branches */
+#define LBR_CALL_STACK_BIT     9 /* enable call stack */
+
+/*
+ * Following bit only exists in Linux; we mask it out before writing it to
+ * the actual MSR. But it helps the constraint perf code to understand
+ * that this is a separate configuration.
+ */
+#define LBR_NO_INFO_BIT               63 /* don't read LBR_INFO. */
+
+#define LBR_KERNEL     (1 << LBR_KERNEL_BIT)
+#define LBR_USER       (1 << LBR_USER_BIT)
+#define LBR_JCC                (1 << LBR_JCC_BIT)
+#define LBR_REL_CALL   (1 << LBR_REL_CALL_BIT)
+#define LBR_IND_CALL   (1 << LBR_IND_CALL_BIT)
+#define LBR_RETURN     (1 << LBR_RETURN_BIT)
+#define LBR_REL_JMP    (1 << LBR_REL_JMP_BIT)
+#define LBR_IND_JMP    (1 << LBR_IND_JMP_BIT)
+#define LBR_FAR                (1 << LBR_FAR_BIT)
+#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
+#define LBR_NO_INFO    (1ULL << LBR_NO_INFO_BIT)
+
+#define LBR_PLM (LBR_KERNEL | LBR_USER)
+
+#define LBR_SEL_MASK   0x1ff   /* valid bits in LBR_SELECT */
+#define LBR_NOT_SUPP   -1      /* LBR filter not supported */
+#define LBR_IGN                0       /* ignored */
+
+#define LBR_ANY                 \
+       (LBR_JCC        |\
+        LBR_REL_CALL   |\
+        LBR_IND_CALL   |\
+        LBR_RETURN     |\
+        LBR_REL_JMP    |\
+        LBR_IND_JMP    |\
+        LBR_FAR)
+
+#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
+#define LBR_FROM_FLAG_IN_TX    (1ULL << 62)
+#define LBR_FROM_FLAG_ABORT    (1ULL << 61)
+
+/*
+ * x86control flow change classification
+ * x86control flow changes include branches, interrupts, traps, faults
+ */
+enum {
+       X86_BR_NONE             = 0,      /* unknown */
+
+       X86_BR_USER             = 1 << 0, /* branch target is user */
+       X86_BR_KERNEL           = 1 << 1, /* branch target is kernel */
+
+       X86_BR_CALL             = 1 << 2, /* call */
+       X86_BR_RET              = 1 << 3, /* return */
+       X86_BR_SYSCALL          = 1 << 4, /* syscall */
+       X86_BR_SYSRET           = 1 << 5, /* syscall return */
+       X86_BR_INT              = 1 << 6, /* sw interrupt */
+       X86_BR_IRET             = 1 << 7, /* return from interrupt */
+       X86_BR_JCC              = 1 << 8, /* conditional */
+       X86_BR_JMP              = 1 << 9, /* jump */
+       X86_BR_IRQ              = 1 << 10,/* hw interrupt or trap or fault */
+       X86_BR_IND_CALL         = 1 << 11,/* indirect calls */
+       X86_BR_ABORT            = 1 << 12,/* transaction abort */
+       X86_BR_IN_TX            = 1 << 13,/* in transaction */
+       X86_BR_NO_TX            = 1 << 14,/* not in transaction */
+       X86_BR_ZERO_CALL        = 1 << 15,/* zero length call */
+       X86_BR_CALL_STACK       = 1 << 16,/* call stack */
+       X86_BR_IND_JMP          = 1 << 17,/* indirect jump */
+};
+
+#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
+#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)
+
+#define X86_BR_ANY       \
+       (X86_BR_CALL    |\
+        X86_BR_RET     |\
+        X86_BR_SYSCALL |\
+        X86_BR_SYSRET  |\
+        X86_BR_INT     |\
+        X86_BR_IRET    |\
+        X86_BR_JCC     |\
+        X86_BR_JMP      |\
+        X86_BR_IRQ      |\
+        X86_BR_ABORT    |\
+        X86_BR_IND_CALL |\
+        X86_BR_IND_JMP  |\
+        X86_BR_ZERO_CALL)
+
+#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
+
+#define X86_BR_ANY_CALL                 \
+       (X86_BR_CALL            |\
+        X86_BR_IND_CALL        |\
+        X86_BR_ZERO_CALL       |\
+        X86_BR_SYSCALL         |\
+        X86_BR_IRQ             |\
+        X86_BR_INT)
+
+static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
+
+/*
+ * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
+ * otherwise it becomes near impossible to get a reliable stack.
+ */
+
+static void __intel_pmu_lbr_enable(bool pmi)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       u64 debugctl, lbr_select = 0, orig_debugctl;
+
+       /*
+        * No need to unfreeze manually, as v4 can do that as part
+        * of the GLOBAL_STATUS ack.
+        */
+       if (pmi && x86_pmu.version >= 4)
+               return;
+
+       /*
+        * No need to reprogram LBR_SELECT in a PMI, as it
+        * did not change.
+        */
+       if (cpuc->lbr_sel)
+               lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask;
+       if (!pmi && cpuc->lbr_sel)
+               wrmsrl(MSR_LBR_SELECT, lbr_select);
+
+       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+       orig_debugctl = debugctl;
+       debugctl |= DEBUGCTLMSR_LBR;
+       /*
+        * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
+        * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
+        * may cause superfluous increase/decrease of LBR_TOS.
+        */
+       if (!(lbr_select & LBR_CALL_STACK))
+               debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+       if (orig_debugctl != debugctl)
+               wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void __intel_pmu_lbr_disable(void)
+{
+       u64 debugctl;
+
+       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+       debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+       wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void intel_pmu_lbr_reset_32(void)
+{
+       int i;
+
+       for (i = 0; i < x86_pmu.lbr_nr; i++)
+               wrmsrl(x86_pmu.lbr_from + i, 0);
+}
+
+static void intel_pmu_lbr_reset_64(void)
+{
+       int i;
+
+       for (i = 0; i < x86_pmu.lbr_nr; i++) {
+               wrmsrl(x86_pmu.lbr_from + i, 0);
+               wrmsrl(x86_pmu.lbr_to   + i, 0);
+               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+                       wrmsrl(MSR_LBR_INFO_0 + i, 0);
+       }
+}
+
+void intel_pmu_lbr_reset(void)
+{
+       if (!x86_pmu.lbr_nr)
+               return;
+
+       if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+               intel_pmu_lbr_reset_32();
+       else
+               intel_pmu_lbr_reset_64();
+}
+
+/*
+ * TOS = most recently recorded branch
+ */
+static inline u64 intel_pmu_lbr_tos(void)
+{
+       u64 tos;
+
+       rdmsrl(x86_pmu.lbr_tos, tos);
+       return tos;
+}
+
+enum {
+       LBR_NONE,
+       LBR_VALID,
+};
+
+static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
+{
+       int i;
+       unsigned lbr_idx, mask;
+       u64 tos;
+
+       if (task_ctx->lbr_callstack_users == 0 ||
+           task_ctx->lbr_stack_state == LBR_NONE) {
+               intel_pmu_lbr_reset();
+               return;
+       }
+
+       mask = x86_pmu.lbr_nr - 1;
+       tos = task_ctx->tos;
+       for (i = 0; i < tos; i++) {
+               lbr_idx = (tos - i) & mask;
+               wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+               wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+                       wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
+       }
+       wrmsrl(x86_pmu.lbr_tos, tos);
+       task_ctx->lbr_stack_state = LBR_NONE;
+}
+
+static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
+{
+       int i;
+       unsigned lbr_idx, mask;
+       u64 tos;
+
+       if (task_ctx->lbr_callstack_users == 0) {
+               task_ctx->lbr_stack_state = LBR_NONE;
+               return;
+       }
+
+       mask = x86_pmu.lbr_nr - 1;
+       tos = intel_pmu_lbr_tos();
+       for (i = 0; i < tos; i++) {
+               lbr_idx = (tos - i) & mask;
+               rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+               rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+                       rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
+       }
+       task_ctx->tos = tos;
+       task_ctx->lbr_stack_state = LBR_VALID;
+}
+
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct x86_perf_task_context *task_ctx;
+
+       /*
+        * If LBR callstack feature is enabled and the stack was saved when
+        * the task was scheduled out, restore the stack. Otherwise flush
+        * the LBR stack.
+        */
+       task_ctx = ctx ? ctx->task_ctx_data : NULL;
+       if (task_ctx) {
+               if (sched_in) {
+                       __intel_pmu_lbr_restore(task_ctx);
+                       cpuc->lbr_context = ctx;
+               } else {
+                       __intel_pmu_lbr_save(task_ctx);
+               }
+               return;
+       }
+
+       /*
+        * When sampling the branck stack in system-wide, it may be
+        * necessary to flush the stack on context switch. This happens
+        * when the branch stack does not tag its entries with the pid
+        * of the current task. Otherwise it becomes impossible to
+        * associate a branch entry with a task. This ambiguity is more
+        * likely to appear when the branch stack supports priv level
+        * filtering and the user sets it to monitor only at the user
+        * level (which could be a useful measurement in system-wide
+        * mode). In that case, the risk is high of having a branch
+        * stack with branch from multiple tasks.
+        */
+       if (sched_in) {
+               intel_pmu_lbr_reset();
+               cpuc->lbr_context = ctx;
+       }
+}
+
+static inline bool branch_user_callstack(unsigned br_sel)
+{
+       return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
+}
+
+void intel_pmu_lbr_enable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct x86_perf_task_context *task_ctx;
+
+       if (!x86_pmu.lbr_nr)
+               return;
+
+       /*
+        * Reset the LBR stack if we changed task context to
+        * avoid data leaks.
+        */
+       if (event->ctx->task && cpuc->lbr_context != event->ctx) {
+               intel_pmu_lbr_reset();
+               cpuc->lbr_context = event->ctx;
+       }
+       cpuc->br_sel = event->hw.branch_reg.reg;
+
+       if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+                                       event->ctx->task_ctx_data) {
+               task_ctx = event->ctx->task_ctx_data;
+               task_ctx->lbr_callstack_users++;
+       }
+
+       cpuc->lbr_users++;
+       perf_sched_cb_inc(event->ctx->pmu);
+}
+
+void intel_pmu_lbr_disable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct x86_perf_task_context *task_ctx;
+
+       if (!x86_pmu.lbr_nr)
+               return;
+
+       if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+                                       event->ctx->task_ctx_data) {
+               task_ctx = event->ctx->task_ctx_data;
+               task_ctx->lbr_callstack_users--;
+       }
+
+       cpuc->lbr_users--;
+       WARN_ON_ONCE(cpuc->lbr_users < 0);
+       perf_sched_cb_dec(event->ctx->pmu);
+
+       if (cpuc->enabled && !cpuc->lbr_users) {
+               __intel_pmu_lbr_disable();
+               /* avoid stale pointer */
+               cpuc->lbr_context = NULL;
+       }
+}
+
+void intel_pmu_lbr_enable_all(bool pmi)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (cpuc->lbr_users)
+               __intel_pmu_lbr_enable(pmi);
+}
+
+void intel_pmu_lbr_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (cpuc->lbr_users)
+               __intel_pmu_lbr_disable();
+}
+
+static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
+{
+       unsigned long mask = x86_pmu.lbr_nr - 1;
+       u64 tos = intel_pmu_lbr_tos();
+       int i;
+
+       for (i = 0; i < x86_pmu.lbr_nr; i++) {
+               unsigned long lbr_idx = (tos - i) & mask;
+               union {
+                       struct {
+                               u32 from;
+                               u32 to;
+                       };
+                       u64     lbr;
+               } msr_lastbranch;
+
+               rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
+
+               cpuc->lbr_entries[i].from       = msr_lastbranch.from;
+               cpuc->lbr_entries[i].to         = msr_lastbranch.to;
+               cpuc->lbr_entries[i].mispred    = 0;
+               cpuc->lbr_entries[i].predicted  = 0;
+               cpuc->lbr_entries[i].reserved   = 0;
+       }
+       cpuc->lbr_stack.nr = i;
+}
+
+/*
+ * Due to lack of segmentation in Linux the effective address (offset)
+ * is the same as the linear address, allowing us to merge the LIP and EIP
+ * LBR formats.
+ */
+static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+{
+       bool need_info = false;
+       unsigned long mask = x86_pmu.lbr_nr - 1;
+       int lbr_format = x86_pmu.intel_cap.lbr_format;
+       u64 tos = intel_pmu_lbr_tos();
+       int i;
+       int out = 0;
+       int num = x86_pmu.lbr_nr;
+
+       if (cpuc->lbr_sel) {
+               need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO);
+               if (cpuc->lbr_sel->config & LBR_CALL_STACK)
+                       num = tos;
+       }
+
+       for (i = 0; i < num; i++) {
+               unsigned long lbr_idx = (tos - i) & mask;
+               u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
+               int skip = 0;
+               u16 cycles = 0;
+               int lbr_flags = lbr_desc[lbr_format];
+
+               rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
+               rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
+
+               if (lbr_format == LBR_FORMAT_INFO && need_info) {
+                       u64 info;
+
+                       rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
+                       mis = !!(info & LBR_INFO_MISPRED);
+                       pred = !mis;
+                       in_tx = !!(info & LBR_INFO_IN_TX);
+                       abort = !!(info & LBR_INFO_ABORT);
+                       cycles = (info & LBR_INFO_CYCLES);
+               }
+               if (lbr_flags & LBR_EIP_FLAGS) {
+                       mis = !!(from & LBR_FROM_FLAG_MISPRED);
+                       pred = !mis;
+                       skip = 1;
+               }
+               if (lbr_flags & LBR_TSX) {
+                       in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
+                       abort = !!(from & LBR_FROM_FLAG_ABORT);
+                       skip = 3;
+               }
+               from = (u64)((((s64)from) << skip) >> skip);
+
+               /*
+                * Some CPUs report duplicated abort records,
+                * with the second entry not having an abort bit set.
+                * Skip them here. This loop runs backwards,
+                * so we need to undo the previous record.
+                * If the abort just happened outside the window
+                * the extra entry cannot be removed.
+                */
+               if (abort && x86_pmu.lbr_double_abort && out > 0)
+                       out--;
+
+               cpuc->lbr_entries[out].from      = from;
+               cpuc->lbr_entries[out].to        = to;
+               cpuc->lbr_entries[out].mispred   = mis;
+               cpuc->lbr_entries[out].predicted = pred;
+               cpuc->lbr_entries[out].in_tx     = in_tx;
+               cpuc->lbr_entries[out].abort     = abort;
+               cpuc->lbr_entries[out].cycles    = cycles;
+               cpuc->lbr_entries[out].reserved  = 0;
+               out++;
+       }
+       cpuc->lbr_stack.nr = out;
+}
+
+void intel_pmu_lbr_read(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (!cpuc->lbr_users)
+               return;
+
+       if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+               intel_pmu_lbr_read_32(cpuc);
+       else
+               intel_pmu_lbr_read_64(cpuc);
+
+       intel_pmu_lbr_filter(cpuc);
+}
+
+/*
+ * SW filter is used:
+ * - in case there is no HW filter
+ * - in case the HW filter has errata or limitations
+ */
+static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+{
+       u64 br_type = event->attr.branch_sample_type;
+       int mask = 0;
+
+       if (br_type & PERF_SAMPLE_BRANCH_USER)
+               mask |= X86_BR_USER;
+
+       if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
+               mask |= X86_BR_KERNEL;
+
+       /* we ignore BRANCH_HV here */
+
+       if (br_type & PERF_SAMPLE_BRANCH_ANY)
+               mask |= X86_BR_ANY;
+
+       if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
+               mask |= X86_BR_ANY_CALL;
+
+       if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
+               mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
+
+       if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
+               mask |= X86_BR_IND_CALL;
+
+       if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX)
+               mask |= X86_BR_ABORT;
+
+       if (br_type & PERF_SAMPLE_BRANCH_IN_TX)
+               mask |= X86_BR_IN_TX;
+
+       if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
+               mask |= X86_BR_NO_TX;
+
+       if (br_type & PERF_SAMPLE_BRANCH_COND)
+               mask |= X86_BR_JCC;
+
+       if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
+               if (!x86_pmu_has_lbr_callstack())
+                       return -EOPNOTSUPP;
+               if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
+                       return -EINVAL;
+               mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
+                       X86_BR_CALL_STACK;
+       }
+
+       if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP)
+               mask |= X86_BR_IND_JMP;
+
+       if (br_type & PERF_SAMPLE_BRANCH_CALL)
+               mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
+       /*
+        * stash actual user request into reg, it may
+        * be used by fixup code for some CPU
+        */
+       event->hw.branch_reg.reg = mask;
+       return 0;
+}
+
+/*
+ * setup the HW LBR filter
+ * Used only when available, may not be enough to disambiguate
+ * all branches, may need the help of the SW filter
+ */
+static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg;
+       u64 br_type = event->attr.branch_sample_type;
+       u64 mask = 0, v;
+       int i;
+
+       for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
+               if (!(br_type & (1ULL << i)))
+                       continue;
+
+               v = x86_pmu.lbr_sel_map[i];
+               if (v == LBR_NOT_SUPP)
+                       return -EOPNOTSUPP;
+
+               if (v != LBR_IGN)
+                       mask |= v;
+       }
+
+       reg = &event->hw.branch_reg;
+       reg->idx = EXTRA_REG_LBR;
+
+       /*
+        * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
+        * in suppress mode. So LBR_SELECT should be set to
+        * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+        */
+       reg->config = mask ^ x86_pmu.lbr_sel_mask;
+
+       if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
+           (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
+           (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO))
+               reg->config |= LBR_NO_INFO;
+
+       return 0;
+}
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event)
+{
+       int ret = 0;
+
+       /*
+        * no LBR on this PMU
+        */
+       if (!x86_pmu.lbr_nr)
+               return -EOPNOTSUPP;
+
+       /*
+        * setup SW LBR filter
+        */
+       ret = intel_pmu_setup_sw_lbr_filter(event);
+       if (ret)
+               return ret;
+
+       /*
+        * setup HW LBR filter, if any
+        */
+       if (x86_pmu.lbr_sel_map)
+               ret = intel_pmu_setup_hw_lbr_filter(event);
+
+       return ret;
+}
+
+/*
+ * return the type of control flow change at address "from"
+ * intruction is not necessarily a branch (in case of interrupt).
+ *
+ * The branch type returned also includes the priv level of the
+ * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
+ *
+ * If a branch type is unknown OR the instruction cannot be
+ * decoded (e.g., text page not present), then X86_BR_NONE is
+ * returned.
+ */
+static int branch_type(unsigned long from, unsigned long to, int abort)
+{
+       struct insn insn;
+       void *addr;
+       int bytes_read, bytes_left;
+       int ret = X86_BR_NONE;
+       int ext, to_plm, from_plm;
+       u8 buf[MAX_INSN_SIZE];
+       int is64 = 0;
+
+       to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
+       from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
+
+       /*
+        * maybe zero if lbr did not fill up after a reset by the time
+        * we get a PMU interrupt
+        */
+       if (from == 0 || to == 0)
+               return X86_BR_NONE;
+
+       if (abort)
+               return X86_BR_ABORT | to_plm;
+
+       if (from_plm == X86_BR_USER) {
+               /*
+                * can happen if measuring at the user level only
+                * and we interrupt in a kernel thread, e.g., idle.
+                */
+               if (!current->mm)
+                       return X86_BR_NONE;
+
+               /* may fail if text not present */
+               bytes_left = copy_from_user_nmi(buf, (void __user *)from,
+                                               MAX_INSN_SIZE);
+               bytes_read = MAX_INSN_SIZE - bytes_left;
+               if (!bytes_read)
+                       return X86_BR_NONE;
+
+               addr = buf;
+       } else {
+               /*
+                * The LBR logs any address in the IP, even if the IP just
+                * faulted. This means userspace can control the from address.
+                * Ensure we don't blindy read any address by validating it is
+                * a known text address.
+                */
+               if (kernel_text_address(from)) {
+                       addr = (void *)from;
+                       /*
+                        * Assume we can get the maximum possible size
+                        * when grabbing kernel data.  This is not
+                        * _strictly_ true since we could possibly be
+                        * executing up next to a memory hole, but
+                        * it is very unlikely to be a problem.
+                        */
+                       bytes_read = MAX_INSN_SIZE;
+               } else {
+                       return X86_BR_NONE;
+               }
+       }
+
+       /*
+        * decoder needs to know the ABI especially
+        * on 64-bit systems running 32-bit apps
+        */
+#ifdef CONFIG_X86_64
+       is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
+#endif
+       insn_init(&insn, addr, bytes_read, is64);
+       insn_get_opcode(&insn);
+       if (!insn.opcode.got)
+               return X86_BR_ABORT;
+
+       switch (insn.opcode.bytes[0]) {
+       case 0xf:
+               switch (insn.opcode.bytes[1]) {
+               case 0x05: /* syscall */
+               case 0x34: /* sysenter */
+                       ret = X86_BR_SYSCALL;
+                       break;
+               case 0x07: /* sysret */
+               case 0x35: /* sysexit */
+                       ret = X86_BR_SYSRET;
+                       break;
+               case 0x80 ... 0x8f: /* conditional */
+                       ret = X86_BR_JCC;
+                       break;
+               default:
+                       ret = X86_BR_NONE;
+               }
+               break;
+       case 0x70 ... 0x7f: /* conditional */
+               ret = X86_BR_JCC;
+               break;
+       case 0xc2: /* near ret */
+       case 0xc3: /* near ret */
+       case 0xca: /* far ret */
+       case 0xcb: /* far ret */
+               ret = X86_BR_RET;
+               break;
+       case 0xcf: /* iret */
+               ret = X86_BR_IRET;
+               break;
+       case 0xcc ... 0xce: /* int */
+               ret = X86_BR_INT;
+               break;
+       case 0xe8: /* call near rel */
+               insn_get_immediate(&insn);
+               if (insn.immediate1.value == 0) {
+                       /* zero length call */
+                       ret = X86_BR_ZERO_CALL;
+                       break;
+               }
+       case 0x9a: /* call far absolute */
+               ret = X86_BR_CALL;
+               break;
+       case 0xe0 ... 0xe3: /* loop jmp */
+               ret = X86_BR_JCC;
+               break;
+       case 0xe9 ... 0xeb: /* jmp */
+               ret = X86_BR_JMP;
+               break;
+       case 0xff: /* call near absolute, call far absolute ind */
+               insn_get_modrm(&insn);
+               ext = (insn.modrm.bytes[0] >> 3) & 0x7;
+               switch (ext) {
+               case 2: /* near ind call */
+               case 3: /* far ind call */
+                       ret = X86_BR_IND_CALL;
+                       break;
+               case 4:
+               case 5:
+                       ret = X86_BR_IND_JMP;
+                       break;
+               }
+               break;
+       default:
+               ret = X86_BR_NONE;
+       }
+       /*
+        * interrupts, traps, faults (and thus ring transition) may
+        * occur on any instructions. Thus, to classify them correctly,
+        * we need to first look at the from and to priv levels. If they
+        * are different and to is in the kernel, then it indicates
+        * a ring transition. If the from instruction is not a ring
+        * transition instr (syscall, systenter, int), then it means
+        * it was a irq, trap or fault.
+        *
+        * we have no way of detecting kernel to kernel faults.
+        */
+       if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
+           && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
+               ret = X86_BR_IRQ;
+
+       /*
+        * branch priv level determined by target as
+        * is done by HW when LBR_SELECT is implemented
+        */
+       if (ret != X86_BR_NONE)
+               ret |= to_plm;
+
+       return ret;
+}
+
+/*
+ * implement actual branch filter based on user demand.
+ * Hardware may not exactly satisfy that request, thus
+ * we need to inspect opcodes. Mismatched branches are
+ * discarded. Therefore, the number of branches returned
+ * in PERF_SAMPLE_BRANCH_STACK sample may vary.
+ */
+static void
+intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
+{
+       u64 from, to;
+       int br_sel = cpuc->br_sel;
+       int i, j, type;
+       bool compress = false;
+
+       /* if sampling all branches, then nothing to filter */
+       if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
+               return;
+
+       for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+
+               from = cpuc->lbr_entries[i].from;
+               to = cpuc->lbr_entries[i].to;
+
+               type = branch_type(from, to, cpuc->lbr_entries[i].abort);
+               if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
+                       if (cpuc->lbr_entries[i].in_tx)
+                               type |= X86_BR_IN_TX;
+                       else
+                               type |= X86_BR_NO_TX;
+               }
+
+               /* if type does not correspond, then discard */
+               if (type == X86_BR_NONE || (br_sel & type) != type) {
+                       cpuc->lbr_entries[i].from = 0;
+                       compress = true;
+               }
+       }
+
+       if (!compress)
+               return;
+
+       /* remove all entries with from=0 */
+       for (i = 0; i < cpuc->lbr_stack.nr; ) {
+               if (!cpuc->lbr_entries[i].from) {
+                       j = i;
+                       while (++j < cpuc->lbr_stack.nr)
+                               cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
+                       cpuc->lbr_stack.nr--;
+                       if (!cpuc->lbr_entries[i].from)
+                               continue;
+               }
+               i++;
+       }
+}
+
+/*
+ * Map interface branch filters onto LBR filters
+ */
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
+       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
+       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
+       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
+       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_REL_JMP
+                                               | LBR_IND_JMP | LBR_FAR,
+       /*
+        * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
+        */
+       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
+        LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
+       /*
+        * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
+        */
+       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
+       [PERF_SAMPLE_BRANCH_COND_SHIFT]     = LBR_JCC,
+       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
+};
+
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
+       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
+       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
+       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
+       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_FAR,
+       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]     = LBR_REL_CALL | LBR_IND_CALL
+                                               | LBR_FAR,
+       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]     = LBR_IND_CALL,
+       [PERF_SAMPLE_BRANCH_COND_SHIFT]         = LBR_JCC,
+       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]     = LBR_IND_JMP,
+       [PERF_SAMPLE_BRANCH_CALL_SHIFT]         = LBR_REL_CALL,
+};
+
+static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
+       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
+       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
+       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
+       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_FAR,
+       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]     = LBR_REL_CALL | LBR_IND_CALL
+                                               | LBR_FAR,
+       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]     = LBR_IND_CALL,
+       [PERF_SAMPLE_BRANCH_COND_SHIFT]         = LBR_JCC,
+       [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT]   = LBR_REL_CALL | LBR_IND_CALL
+                                               | LBR_RETURN | LBR_CALL_STACK,
+       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]     = LBR_IND_JMP,
+       [PERF_SAMPLE_BRANCH_CALL_SHIFT]         = LBR_REL_CALL,
+};
+
+/* core */
+void __init intel_pmu_lbr_init_core(void)
+{
+       x86_pmu.lbr_nr     = 4;
+       x86_pmu.lbr_tos    = MSR_LBR_TOS;
+       x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
+       x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
+
+       /*
+        * SW branch filter usage:
+        * - compensate for lack of HW filter
+        */
+       pr_cont("4-deep LBR, ");
+}
+
+/* nehalem/westmere */
+void __init intel_pmu_lbr_init_nhm(void)
+{
+       x86_pmu.lbr_nr     = 16;
+       x86_pmu.lbr_tos    = MSR_LBR_TOS;
+       x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
+       x86_pmu.lbr_to     = MSR_LBR_NHM_TO;
+
+       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+       x86_pmu.lbr_sel_map  = nhm_lbr_sel_map;
+
+       /*
+        * SW branch filter usage:
+        * - workaround LBR_SEL errata (see above)
+        * - support syscall, sysret capture.
+        *   That requires LBR_FAR but that means far
+        *   jmp need to be filtered out
+        */
+       pr_cont("16-deep LBR, ");
+}
+
+/* sandy bridge */
+void __init intel_pmu_lbr_init_snb(void)
+{
+       x86_pmu.lbr_nr   = 16;
+       x86_pmu.lbr_tos  = MSR_LBR_TOS;
+       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+
+       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+       x86_pmu.lbr_sel_map  = snb_lbr_sel_map;
+
+       /*
+        * SW branch filter usage:
+        * - support syscall, sysret capture.
+        *   That requires LBR_FAR but that means far
+        *   jmp need to be filtered out
+        */
+       pr_cont("16-deep LBR, ");
+}
+
+/* haswell */
+void intel_pmu_lbr_init_hsw(void)
+{
+       x86_pmu.lbr_nr   = 16;
+       x86_pmu.lbr_tos  = MSR_LBR_TOS;
+       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+
+       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+       x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
+
+       pr_cont("16-deep LBR, ");
+}
+
+/* skylake */
+__init void intel_pmu_lbr_init_skl(void)
+{
+       x86_pmu.lbr_nr   = 32;
+       x86_pmu.lbr_tos  = MSR_LBR_TOS;
+       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+
+       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+       x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
+
+       /*
+        * SW branch filter usage:
+        * - support syscall, sysret capture.
+        *   That requires LBR_FAR but that means far
+        *   jmp need to be filtered out
+        */
+       pr_cont("32-deep LBR, ");
+}
+
+/* atom */
+void __init intel_pmu_lbr_init_atom(void)
+{
+       /*
+        * only models starting at stepping 10 seems
+        * to have an operational LBR which can freeze
+        * on PMU interrupt
+        */
+       if (boot_cpu_data.x86_model == 28
+           && boot_cpu_data.x86_mask < 10) {
+               pr_cont("LBR disabled due to erratum");
+               return;
+       }
+
+       x86_pmu.lbr_nr     = 8;
+       x86_pmu.lbr_tos    = MSR_LBR_TOS;
+       x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
+       x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
+
+       /*
+        * SW branch filter usage:
+        * - compensate for lack of HW filter
+        */
+       pr_cont("8-deep LBR, ");
+}
+
+/* Knights Landing */
+void intel_pmu_lbr_init_knl(void)
+{
+       x86_pmu.lbr_nr     = 8;
+       x86_pmu.lbr_tos    = MSR_LBR_TOS;
+       x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
+       x86_pmu.lbr_to     = MSR_LBR_NHM_TO;
+
+       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+       x86_pmu.lbr_sel_map  = snb_lbr_sel_map;
+
+       pr_cont("8-deep LBR, ");
+}
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c

new file mode 100644 (file)

index 0000000..0a5ede1
--- /dev/null
+++ b/arch/x86/events/intel/p4.c
@@ -0,0 +1,1376 @@
+/*
+ * Netburst Performance Events (P4, old Xeon)
+ *
+ *  Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
+ *  Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+
+#include <asm/perf_event_p4.h>
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+#define P4_CNTR_LIMIT 3
+/*
+ * array indices: 0,1 - HT threads, used with HT enabled cpu
+ */
+struct p4_event_bind {
+       unsigned int opcode;                    /* Event code and ESCR selector */
+       unsigned int escr_msr[2];               /* ESCR MSR for this event */
+       unsigned int escr_emask;                /* valid ESCR EventMask bits */
+       unsigned int shared;                    /* event is shared across threads */
+       char cntr[2][P4_CNTR_LIMIT];            /* counter index (offset), -1 on abscence */
+};
+
+struct p4_pebs_bind {
+       unsigned int metric_pebs;
+       unsigned int metric_vert;
+};
+
+/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
+#define P4_GEN_PEBS_BIND(name, pebs, vert)                     \
+       [P4_PEBS_METRIC__##name] = {                            \
+               .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG,   \
+               .metric_vert = vert,                            \
+       }
+
+/*
+ * note we have P4_PEBS_ENABLE_UOP_TAG always set here
+ *
+ * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
+ * event configuration to find out which values are to be
+ * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
+ * resgisters
+ */
+static struct p4_pebs_bind p4_pebs_bind_map[] = {
+       P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired,  0x0000001, 0x0000001),
+       P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired,  0x0000002, 0x0000001),
+       P4_GEN_PEBS_BIND(dtlb_load_miss_retired,        0x0000004, 0x0000001),
+       P4_GEN_PEBS_BIND(dtlb_store_miss_retired,       0x0000004, 0x0000002),
+       P4_GEN_PEBS_BIND(dtlb_all_miss_retired,         0x0000004, 0x0000003),
+       P4_GEN_PEBS_BIND(tagged_mispred_branch,         0x0018000, 0x0000010),
+       P4_GEN_PEBS_BIND(mob_load_replay_retired,       0x0000200, 0x0000001),
+       P4_GEN_PEBS_BIND(split_load_retired,            0x0000400, 0x0000001),
+       P4_GEN_PEBS_BIND(split_store_retired,           0x0000400, 0x0000002),
+};
+
+/*
+ * Note that we don't use CCCR1 here, there is an
+ * exception for P4_BSQ_ALLOCATION but we just have
+ * no workaround
+ *
+ * consider this binding as resources which particular
+ * event may borrow, it doesn't contain EventMask,
+ * Tags and friends -- they are left to a caller
+ */
+static struct p4_event_bind p4_event_bind_map[] = {
+       [P4_EVENT_TC_DELIVER_MODE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
+               .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
+               .shared         = 1,
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_BPU_FETCH_REQUEST] = {
+               .opcode         = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
+               .escr_msr       = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_ITLB_REFERENCE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
+               .escr_msr       = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_MEMORY_CANCEL] = {
+               .opcode         = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
+               .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_MEMORY_COMPLETE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
+               .escr_msr       = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_LOAD_PORT_REPLAY] = {
+               .opcode         = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
+               .escr_msr       = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_STORE_PORT_REPLAY] = {
+               .opcode         = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
+               .escr_msr       = { MSR_P4_SAAT_ESCR0 ,  MSR_P4_SAAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_MOB_LOAD_REPLAY] = {
+               .opcode         = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
+               .escr_msr       = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_PAGE_WALK_TYPE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
+               .escr_msr       = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
+               .shared         = 1,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_BSQ_CACHE_REFERENCE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
+               .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_IOQ_ALLOCATION] = {
+               .opcode         = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER)               |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_IOQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
+               .opcode         = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
+               .escr_msr       = { MSR_P4_FSB_ESCR1,  MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
+               .cntr           = { {2, -1, -1}, {3, -1, -1} },
+       },
+       [P4_EVENT_FSB_DATA_ACTIVITY] = {
+               .opcode         = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
+               .shared         = 1,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_BSQ_ALLOCATION] = {           /* shared ESCR, broken CCCR1 */
+               .opcode         = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
+               .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE)      |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE)      |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
+               .cntr           = { {0, -1, -1}, {1, -1, -1} },
+       },
+       [P4_EVENT_BSQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
+               .opcode         = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
+               .escr_msr       = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE)     |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE)  |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE)  |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
+               .cntr           = { {2, -1, -1}, {3, -1, -1} },
+       },
+       [P4_EVENT_SSE_INPUT_ASSIST] = {
+               .opcode         = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_PACKED_SP_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_PACKED_DP_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_SCALAR_SP_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_SCALAR_DP_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_64BIT_MMX_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_128BIT_MMX_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_X87_FP_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_X87_FP_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_TC_MISC] = {
+               .opcode         = P4_OPCODE(P4_EVENT_TC_MISC),
+               .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_GLOBAL_POWER_EVENTS] = {
+               .opcode         = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_TC_MS_XFER] = {
+               .opcode         = P4_OPCODE(P4_EVENT_TC_MS_XFER),
+               .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_UOP_QUEUE_WRITES] = {
+               .opcode         = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
+               .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD)     |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
+               .escr_msr       = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_RETIRED_BRANCH_TYPE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
+               .escr_msr       = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_RESOURCE_STALL] = {
+               .opcode         = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
+               .escr_msr       = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_WC_BUFFER] = {
+               .opcode         = P4_OPCODE(P4_EVENT_WC_BUFFER),
+               .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS)               |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_B2B_CYCLES] = {
+               .opcode         = P4_OPCODE(P4_EVENT_B2B_CYCLES),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_BNR] = {
+               .opcode         = P4_OPCODE(P4_EVENT_BNR),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_SNOOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_SNOOP),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_RESPONSE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_RESPONSE),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_FRONT_END_EVENT] = {
+               .opcode         = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_EXECUTION_EVENT] = {
+               .opcode         = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_REPLAY_EVENT] = {
+               .opcode         = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_INSTR_RETIRED] = {
+               .opcode         = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
+               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_UOPS_RETIRED] = {
+               .opcode         = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
+               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_UOP_TYPE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_UOP_TYPE),
+               .escr_msr       = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS)                  |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_BRANCH_RETIRED] = {
+               .opcode         = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
+               .opcode         = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
+               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_X87_ASSIST] = {
+               .opcode         = P4_OPCODE(P4_EVENT_X87_ASSIST),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_MACHINE_CLEAR] = {
+               .opcode         = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_INSTR_COMPLETED] = {
+               .opcode         = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
+               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+};
+
+#define P4_GEN_CACHE_EVENT(event, bit, metric)                           \
+       p4_config_pack_escr(P4_ESCR_EVENT(event)                        | \
+                           P4_ESCR_EMASK_BIT(event, bit))              | \
+       p4_config_pack_cccr(metric                                      | \
+                           P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
+
+static __initconst const u64 p4_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+                                               P4_PEBS_METRIC__1stl_cache_load_miss_retired),
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+                                               P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
+       },
+},
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+                                               P4_PEBS_METRIC__dtlb_load_miss_retired),
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+                                               P4_PEBS_METRIC__dtlb_store_miss_retired),
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
+                                               P4_PEBS_METRIC__none),
+               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
+                                               P4_PEBS_METRIC__none),
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+/*
+ * Because of Netburst being quite restricted in how many
+ * identical events may run simultaneously, we introduce event aliases,
+ * ie the different events which have the same functionality but
+ * utilize non-intersected resources (ESCR/CCCR/counter registers).
+ *
+ * This allow us to relax restrictions a bit and run two or more
+ * identical events together.
+ *
+ * Never set any custom internal bits such as P4_CONFIG_HT,
+ * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
+ * either up to date automatically or not applicable at all.
+ */
+struct p4_event_alias {
+       u64 original;
+       u64 alternative;
+} p4_event_aliases[] = {
+       {
+               /*
+                * Non-halted cycles can be substituted with non-sleeping cycles (see
+                * Intel SDM Vol3b for details). We need this alias to be able
+                * to run nmi-watchdog and 'perf top' (or any other user space tool
+                * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
+                * simultaneously.
+                */
+       .original       =
+               p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)         |
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+       .alternative    =
+               p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT)             |
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
+               p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT          |
+                                   P4_CCCR_COMPARE),
+       },
+};
+
+static u64 p4_get_alias_event(u64 config)
+{
+       u64 config_match;
+       int i;
+
+       /*
+        * Only event with special mark is allowed,
+        * we're to be sure it didn't come as malformed
+        * RAW event.
+        */
+       if (!(config & P4_CONFIG_ALIASABLE))
+               return 0;
+
+       config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
+
+       for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
+               if (config_match == p4_event_aliases[i].original) {
+                       config_match = p4_event_aliases[i].alternative;
+                       break;
+               } else if (config_match == p4_event_aliases[i].alternative) {
+                       config_match = p4_event_aliases[i].original;
+                       break;
+               }
+       }
+
+       if (i >= ARRAY_SIZE(p4_event_aliases))
+               return 0;
+
+       return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
+}
+
+static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
+  /* non-halted CPU clocks */
+  [PERF_COUNT_HW_CPU_CYCLES] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING))       |
+               P4_CONFIG_ALIASABLE,
+
+  /*
+   * retired instructions
+   * in a sake of simplicity we don't use the FSB tagging
+   */
+  [PERF_COUNT_HW_INSTRUCTIONS] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED)               |
+               P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)           |
+               P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
+
+  /* cache hits */
+  [PERF_COUNT_HW_CACHE_REFERENCES] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
+
+  /* cache misses */
+  [PERF_COUNT_HW_CACHE_MISSES] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
+
+  /* branch instructions retired */
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)    |
+               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)           |
+               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
+
+  /* mispredicted branches retired */
+  [PERF_COUNT_HW_BRANCH_MISSES]        =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED)      |
+               P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
+
+  /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN):  */
+  [PERF_COUNT_HW_BUS_CYCLES] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY)           |
+               P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN))        |
+       p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
+};
+
+static struct p4_event_bind *p4_config_get_bind(u64 config)
+{
+       unsigned int evnt = p4_config_unpack_event(config);
+       struct p4_event_bind *bind = NULL;
+
+       if (evnt < ARRAY_SIZE(p4_event_bind_map))
+               bind = &p4_event_bind_map[evnt];
+
+       return bind;
+}
+
+static u64 p4_pmu_event_map(int hw_event)
+{
+       struct p4_event_bind *bind;
+       unsigned int esel;
+       u64 config;
+
+       config = p4_general_events[hw_event];
+       bind = p4_config_get_bind(config);
+       esel = P4_OPCODE_ESEL(bind->opcode);
+       config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
+
+       return config;
+}
+
+/* check cpu model specifics */
+static bool p4_event_match_cpu_model(unsigned int event_idx)
+{
+       /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */
+       if (event_idx == P4_EVENT_INSTR_COMPLETED) {
+               if (boot_cpu_data.x86_model != 3 &&
+                       boot_cpu_data.x86_model != 4 &&
+                       boot_cpu_data.x86_model != 6)
+                       return false;
+       }
+
+       /*
+        * For info
+        * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2
+        */
+
+       return true;
+}
+
+static int p4_validate_raw_event(struct perf_event *event)
+{
+       unsigned int v, emask;
+
+       /* User data may have out-of-bound event index */
+       v = p4_config_unpack_event(event->attr.config);
+       if (v >= ARRAY_SIZE(p4_event_bind_map))
+               return -EINVAL;
+
+       /* It may be unsupported: */
+       if (!p4_event_match_cpu_model(v))
+               return -EINVAL;
+
+       /*
+        * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as
+        * in Architectural Performance Monitoring, it means not
+        * on _which_ logical cpu to count but rather _when_, ie it
+        * depends on logical cpu state -- count event if one cpu active,
+        * none, both or any, so we just allow user to pass any value
+        * desired.
+        *
+        * In turn we always set Tx_OS/Tx_USR bits bound to logical
+        * cpu without their propagation to another cpu
+        */
+
+       /*
+        * if an event is shared across the logical threads
+        * the user needs special permissions to be able to use it
+        */
+       if (p4_ht_active() && p4_event_bind_map[v].shared) {
+               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+                       return -EACCES;
+       }
+
+       /* ESCR EventMask bits may be invalid */
+       emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
+       if (emask & ~p4_event_bind_map[v].escr_emask)
+               return -EINVAL;
+
+       /*
+        * it may have some invalid PEBS bits
+        */
+       if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
+               return -EINVAL;
+
+       v = p4_config_unpack_metric(event->attr.config);
+       if (v >= ARRAY_SIZE(p4_pebs_bind_map))
+               return -EINVAL;
+
+       return 0;
+}
+
+static int p4_hw_config(struct perf_event *event)
+{
+       int cpu = get_cpu();
+       int rc = 0;
+       u32 escr, cccr;
+
+       /*
+        * the reason we use cpu that early is that: if we get scheduled
+        * first time on the same cpu -- we will not need swap thread
+        * specific flags in config (and will save some cpu cycles)
+        */
+
+       cccr = p4_default_cccr_conf(cpu);
+       escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
+                                        event->attr.exclude_user);
+       event->hw.config = p4_config_pack_escr(escr) |
+                          p4_config_pack_cccr(cccr);
+
+       if (p4_ht_active() && p4_ht_thread(cpu))
+               event->hw.config = p4_set_ht_bit(event->hw.config);
+
+       if (event->attr.type == PERF_TYPE_RAW) {
+               struct p4_event_bind *bind;
+               unsigned int esel;
+               /*
+                * Clear bits we reserve to be managed by kernel itself
+                * and never allowed from a user space
+                */
+                event->attr.config &= P4_CONFIG_MASK;
+
+               rc = p4_validate_raw_event(event);
+               if (rc)
+                       goto out;
+
+               /*
+                * Note that for RAW events we allow user to use P4_CCCR_RESERVED
+                * bits since we keep additional info here (for cache events and etc)
+                */
+               event->hw.config |= event->attr.config;
+               bind = p4_config_get_bind(event->attr.config);
+               if (!bind) {
+                       rc = -EINVAL;
+                       goto out;
+               }
+               esel = P4_OPCODE_ESEL(bind->opcode);
+               event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
+       }
+
+       rc = x86_setup_perfctr(event);
+out:
+       put_cpu();
+       return rc;
+}
+
+static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
+{
+       u64 v;
+
+       /* an official way for overflow indication */
+       rdmsrl(hwc->config_base, v);
+       if (v & P4_CCCR_OVF) {
+               wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
+               return 1;
+       }
+
+       /*
+        * In some circumstances the overflow might issue an NMI but did
+        * not set P4_CCCR_OVF bit. Because a counter holds a negative value
+        * we simply check for high bit being set, if it's cleared it means
+        * the counter has reached zero value and continued counting before
+        * real NMI signal was received:
+        */
+       rdmsrl(hwc->event_base, v);
+       if (!(v & ARCH_P4_UNFLAGGED_BIT))
+               return 1;
+
+       return 0;
+}
+
+static void p4_pmu_disable_pebs(void)
+{
+       /*
+        * FIXME
+        *
+        * It's still allowed that two threads setup same cache
+        * events so we can't simply clear metrics until we knew
+        * no one is depending on us, so we need kind of counter
+        * for "ReplayEvent" users.
+        *
+        * What is more complex -- RAW events, if user (for some
+        * reason) will pass some cache event metric with improper
+        * event opcode -- it's fine from hardware point of view
+        * but completely nonsense from "meaning" of such action.
+        *
+        * So at moment let leave metrics turned on forever -- it's
+        * ok for now but need to be revisited!
+        *
+        * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0);
+        * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0);
+        */
+}
+
+static inline void p4_pmu_disable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       /*
+        * If event gets disabled while counter is in overflowed
+        * state we need to clear P4_CCCR_OVF, otherwise interrupt get
+        * asserted again and again
+        */
+       (void)wrmsrl_safe(hwc->config_base,
+               p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
+}
+
+static void p4_pmu_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               struct perf_event *event = cpuc->events[idx];
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+               p4_pmu_disable_event(event);
+       }
+
+       p4_pmu_disable_pebs();
+}
+
+/* configuration must be valid */
+static void p4_pmu_enable_pebs(u64 config)
+{
+       struct p4_pebs_bind *bind;
+       unsigned int idx;
+
+       BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
+
+       idx = p4_config_unpack_metric(config);
+       if (idx == P4_PEBS_METRIC__none)
+               return;
+
+       bind = &p4_pebs_bind_map[idx];
+
+       (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
+       (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT,      (u64)bind->metric_vert);
+}
+
+static void p4_pmu_enable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       int thread = p4_ht_config_thread(hwc->config);
+       u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
+       unsigned int idx = p4_config_unpack_event(hwc->config);
+       struct p4_event_bind *bind;
+       u64 escr_addr, cccr;
+
+       bind = &p4_event_bind_map[idx];
+       escr_addr = bind->escr_msr[thread];
+
+       /*
+        * - we dont support cascaded counters yet
+        * - and counter 1 is broken (erratum)
+        */
+       WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
+       WARN_ON_ONCE(hwc->idx == 1);
+
+       /* we need a real Event value */
+       escr_conf &= ~P4_ESCR_EVENT_MASK;
+       escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
+
+       cccr = p4_config_unpack_cccr(hwc->config);
+
+       /*
+        * it could be Cache event so we need to write metrics
+        * into additional MSRs
+        */
+       p4_pmu_enable_pebs(hwc->config);
+
+       (void)wrmsrl_safe(escr_addr, escr_conf);
+       (void)wrmsrl_safe(hwc->config_base,
+                               (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
+}
+
+static void p4_pmu_enable_all(int added)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               struct perf_event *event = cpuc->events[idx];
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+               p4_pmu_enable_event(event);
+       }
+}
+
+static int p4_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_events *cpuc;
+       struct perf_event *event;
+       struct hw_perf_event *hwc;
+       int idx, handled = 0;
+       u64 val;
+
+       cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               int overflow;
+
+               if (!test_bit(idx, cpuc->active_mask)) {
+                       /* catch in-flight IRQs */
+                       if (__test_and_clear_bit(idx, cpuc->running))
+                               handled++;
+                       continue;
+               }
+
+               event = cpuc->events[idx];
+               hwc = &event->hw;
+
+               WARN_ON_ONCE(hwc->idx != idx);
+
+               /* it might be unflagged overflow */
+               overflow = p4_pmu_clear_cccr_ovf(hwc);
+
+               val = x86_perf_event_update(event);
+               if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
+                       continue;
+
+               handled += overflow;
+
+               /* event overflow for sure */
+               perf_sample_data_init(&data, 0, hwc->last_period);
+
+               if (!x86_perf_event_set_period(event))
+                       continue;
+
+
+               if (perf_event_overflow(event, &data, regs))
+                       x86_pmu_stop(event, 0);
+       }
+
+       if (handled)
+               inc_irq_stat(apic_perf_irqs);
+
+       /*
+        * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
+        * been observed that the OVF bit flag has to be cleared first _before_
+        * the LVTPC can be unmasked.
+        *
+        * The reason is the NMI line will continue to be asserted while the OVF
+        * bit is set.  This causes a second NMI to generate if the LVTPC is
+        * unmasked before the OVF bit is cleared, leading to unknown NMI
+        * messages.
+        */
+       apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+       return handled;
+}
+
+/*
+ * swap thread specific fields according to a thread
+ * we are going to run on
+ */
+static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
+{
+       u32 escr, cccr;
+
+       /*
+        * we either lucky and continue on same cpu or no HT support
+        */
+       if (!p4_should_swap_ts(hwc->config, cpu))
+               return;
+
+       /*
+        * the event is migrated from an another logical
+        * cpu, so we need to swap thread specific flags
+        */
+
+       escr = p4_config_unpack_escr(hwc->config);
+       cccr = p4_config_unpack_cccr(hwc->config);
+
+       if (p4_ht_thread(cpu)) {
+               cccr &= ~P4_CCCR_OVF_PMI_T0;
+               cccr |= P4_CCCR_OVF_PMI_T1;
+               if (escr & P4_ESCR_T0_OS) {
+                       escr &= ~P4_ESCR_T0_OS;
+                       escr |= P4_ESCR_T1_OS;
+               }
+               if (escr & P4_ESCR_T0_USR) {
+                       escr &= ~P4_ESCR_T0_USR;
+                       escr |= P4_ESCR_T1_USR;
+               }
+               hwc->config  = p4_config_pack_escr(escr);
+               hwc->config |= p4_config_pack_cccr(cccr);
+               hwc->config |= P4_CONFIG_HT;
+       } else {
+               cccr &= ~P4_CCCR_OVF_PMI_T1;
+               cccr |= P4_CCCR_OVF_PMI_T0;
+               if (escr & P4_ESCR_T1_OS) {
+                       escr &= ~P4_ESCR_T1_OS;
+                       escr |= P4_ESCR_T0_OS;
+               }
+               if (escr & P4_ESCR_T1_USR) {
+                       escr &= ~P4_ESCR_T1_USR;
+                       escr |= P4_ESCR_T0_USR;
+               }
+               hwc->config  = p4_config_pack_escr(escr);
+               hwc->config |= p4_config_pack_cccr(cccr);
+               hwc->config &= ~P4_CONFIG_HT;
+       }
+}
+
+/*
+ * ESCR address hashing is tricky, ESCRs are not sequential
+ * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and
+ * the metric between any ESCRs is laid in range [0xa0,0xe1]
+ *
+ * so we make ~70% filled hashtable
+ */
+
+#define P4_ESCR_MSR_BASE               0x000003a0
+#define P4_ESCR_MSR_MAX                        0x000003e1
+#define P4_ESCR_MSR_TABLE_SIZE         (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
+#define P4_ESCR_MSR_IDX(msr)           (msr - P4_ESCR_MSR_BASE)
+#define P4_ESCR_MSR_TABLE_ENTRY(msr)   [P4_ESCR_MSR_IDX(msr)] = msr
+
+static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
+};
+
+static int p4_get_escr_idx(unsigned int addr)
+{
+       unsigned int idx = P4_ESCR_MSR_IDX(addr);
+
+       if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE      ||
+                       !p4_escr_table[idx]             ||
+                       p4_escr_table[idx] != addr)) {
+               WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
+               return -1;
+       }
+
+       return idx;
+}
+
+static int p4_next_cntr(int thread, unsigned long *used_mask,
+                       struct p4_event_bind *bind)
+{
+       int i, j;
+
+       for (i = 0; i < P4_CNTR_LIMIT; i++) {
+               j = bind->cntr[thread][i];
+               if (j != -1 && !test_bit(j, used_mask))
+                       return j;
+       }
+
+       return -1;
+}
+
+static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+       unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
+       int cpu = smp_processor_id();
+       struct hw_perf_event *hwc;
+       struct p4_event_bind *bind;
+       unsigned int i, thread, num;
+       int cntr_idx, escr_idx;
+       u64 config_alias;
+       int pass;
+
+       bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+       bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
+
+       for (i = 0, num = n; i < n; i++, num--) {
+
+               hwc = &cpuc->event_list[i]->hw;
+               thread = p4_ht_thread(cpu);
+               pass = 0;
+
+again:
+               /*
+                * It's possible to hit a circular lock
+                * between original and alternative events
+                * if both are scheduled already.
+                */
+               if (pass > 2)
+                       goto done;
+
+               bind = p4_config_get_bind(hwc->config);
+               escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
+               if (unlikely(escr_idx == -1))
+                       goto done;
+
+               if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
+                       cntr_idx = hwc->idx;
+                       if (assign)
+                               assign[i] = hwc->idx;
+                       goto reserve;
+               }
+
+               cntr_idx = p4_next_cntr(thread, used_mask, bind);
+               if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
+                       /*
+                        * Check whether an event alias is still available.
+                        */
+                       config_alias = p4_get_alias_event(hwc->config);
+                       if (!config_alias)
+                               goto done;
+                       hwc->config = config_alias;
+                       pass++;
+                       goto again;
+               }
+               /*
+                * Perf does test runs to see if a whole group can be assigned
+                * together succesfully.  There can be multiple rounds of this.
+                * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config
+                * bits, such that the next round of group assignments will
+                * cause the above p4_should_swap_ts to pass instead of fail.
+                * This leads to counters exclusive to thread0 being used by
+                * thread1.
+                *
+                * Solve this with a cheap hack, reset the idx back to -1 to
+                * force a new lookup (p4_next_cntr) to get the right counter
+                * for the right thread.
+                *
+                * This probably doesn't comply with the general spirit of how
+                * perf wants to work, but P4 is special. :-(
+                */
+               if (p4_should_swap_ts(hwc->config, cpu))
+                       hwc->idx = -1;
+               p4_pmu_swap_config_ts(hwc, cpu);
+               if (assign)
+                       assign[i] = cntr_idx;
+reserve:
+               set_bit(cntr_idx, used_mask);
+               set_bit(escr_idx, escr_mask);
+       }
+
+done:
+       return num ? -EINVAL : 0;
+}
+
+PMU_FORMAT_ATTR(cccr, "config:0-31" );
+PMU_FORMAT_ATTR(escr, "config:32-62");
+PMU_FORMAT_ATTR(ht,   "config:63"   );
+
+static struct attribute *intel_p4_formats_attr[] = {
+       &format_attr_cccr.attr,
+       &format_attr_escr.attr,
+       &format_attr_ht.attr,
+       NULL,
+};
+
+static __initconst const struct x86_pmu p4_pmu = {
+       .name                   = "Netburst P4/Xeon",
+       .handle_irq             = p4_pmu_handle_irq,
+       .disable_all            = p4_pmu_disable_all,
+       .enable_all             = p4_pmu_enable_all,
+       .enable                 = p4_pmu_enable_event,
+       .disable                = p4_pmu_disable_event,
+       .eventsel               = MSR_P4_BPU_CCCR0,
+       .perfctr                = MSR_P4_BPU_PERFCTR0,
+       .event_map              = p4_pmu_event_map,
+       .max_events             = ARRAY_SIZE(p4_general_events),
+       .get_event_constraints  = x86_get_event_constraints,
+       /*
+        * IF HT disabled we may need to use all
+        * ARCH_P4_MAX_CCCR counters simulaneously
+        * though leave it restricted at moment assuming
+        * HT is on
+        */
+       .num_counters           = ARCH_P4_MAX_CCCR,
+       .apic                   = 1,
+       .cntval_bits            = ARCH_P4_CNTRVAL_BITS,
+       .cntval_mask            = ARCH_P4_CNTRVAL_MASK,
+       .max_period             = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
+       .hw_config              = p4_hw_config,
+       .schedule_events        = p4_pmu_schedule_events,
+       /*
+        * This handles erratum N15 in intel doc 249199-029,
+        * the counter may not be updated correctly on write
+        * so we need a second write operation to do the trick
+        * (the official workaround didn't work)
+        *
+        * the former idea is taken from OProfile code
+        */
+       .perfctr_second_write   = 1,
+
+       .format_attrs           = intel_p4_formats_attr,
+};
+
+__init int p4_pmu_init(void)
+{
+       unsigned int low, high;
+       int i, reg;
+
+       /* If we get stripped -- indexing fails */
+       BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
+
+       rdmsr(MSR_IA32_MISC_ENABLE, low, high);
+       if (!(low & (1 << 7))) {
+               pr_cont("unsupported Netburst CPU model %d ",
+                       boot_cpu_data.x86_model);
+               return -ENODEV;
+       }
+
+       memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
+               sizeof(hw_cache_event_ids));
+
+       pr_cont("Netburst events, ");
+
+       x86_pmu = p4_pmu;
+
+       /*
+        * Even though the counters are configured to interrupt a particular
+        * logical processor when an overflow happens, testing has shown that
+        * on kdump kernels (which uses a single cpu), thread1's counter
+        * continues to run and will report an NMI on thread0.  Due to the
+        * overflow bug, this leads to a stream of unknown NMIs.
+        *
+        * Solve this by zero'ing out the registers to mimic a reset.
+        */
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               reg = x86_pmu_config_addr(i);
+               wrmsrl_safe(reg, 0ULL);
+       }
+
+       return 0;
+}
diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c

new file mode 100644 (file)

index 0000000..1f5c47a
--- /dev/null
+++ b/arch/x86/events/intel/p6.c
@@ -0,0 +1,279 @@
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include "../perf_event.h"
+
+/*
+ * Not sure about some of these
+ */
+static const u64 p6_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]           = 0x0079,       /* CPU_CLK_UNHALTED */
+  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x00c0,       /* INST_RETIRED     */
+  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0f2e,       /* L2_RQSTS:M:E:S:I */
+  [PERF_COUNT_HW_CACHE_MISSES]         = 0x012e,       /* L2_RQSTS:I       */
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x00c4,       /* BR_INST_RETIRED  */
+  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x00c5,       /* BR_MISS_PRED_RETIRED */
+  [PERF_COUNT_HW_BUS_CYCLES]           = 0x0062,       /* BUS_DRDY_CLOCKS  */
+  [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2,    /* RESOURCE_STALLS  */
+
+};
+
+static const u64 __initconst p6_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0043,  /* DATA_MEM_REFS       */
+                [ C(RESULT_MISS)   ] = 0x0045, /* DCU_LINES_IN        */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0x0f29,  /* L2_LD:M:E:S:I       */
+       },
+        [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+        },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080,  /* IFU_IFETCH         */
+               [ C(RESULT_MISS)   ] = 0x0f28,  /* L2_IFETCH:M:E:S:I  */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0x0025,  /* L2_M_LINES_INM     */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0043,  /* DATA_MEM_REFS      */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080,  /* IFU_IFETCH         */
+               [ C(RESULT_MISS)   ] = 0x0085,  /* ITLB_MISS          */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4,  /* BR_INST_RETIRED      */
+               [ C(RESULT_MISS)   ] = 0x00c5,  /* BR_MISS_PRED_RETIRED */
+        },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+static u64 p6_pmu_event_map(int hw_event)
+{
+       return p6_perfmon_event_map[hw_event];
+}
+
+/*
+ * Event setting that is specified not to count anything.
+ * We use this to effectively disable a counter.
+ *
+ * L2_RQSTS with 0 MESI unit mask.
+ */
+#define P6_NOP_EVENT                   0x0000002EULL
+
+static struct event_constraint p6_event_constraints[] =
+{
+       INTEL_EVENT_CONSTRAINT(0xc1, 0x1),      /* FLOPS */
+       INTEL_EVENT_CONSTRAINT(0x10, 0x1),      /* FP_COMP_OPS_EXE */
+       INTEL_EVENT_CONSTRAINT(0x11, 0x2),      /* FP_ASSIST */
+       INTEL_EVENT_CONSTRAINT(0x12, 0x2),      /* MUL */
+       INTEL_EVENT_CONSTRAINT(0x13, 0x2),      /* DIV */
+       INTEL_EVENT_CONSTRAINT(0x14, 0x1),      /* CYCLES_DIV_BUSY */
+       EVENT_CONSTRAINT_END
+};
+
+static void p6_pmu_disable_all(void)
+{
+       u64 val;
+
+       /* p6 only has one enable register */
+       rdmsrl(MSR_P6_EVNTSEL0, val);
+       val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+       wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static void p6_pmu_enable_all(int added)
+{
+       unsigned long val;
+
+       /* p6 only has one enable register */
+       rdmsrl(MSR_P6_EVNTSEL0, val);
+       val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+       wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static inline void
+p6_pmu_disable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 val = P6_NOP_EVENT;
+
+       (void)wrmsrl_safe(hwc->config_base, val);
+}
+
+static void p6_pmu_enable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 val;
+
+       val = hwc->config;
+
+       /*
+        * p6 only has a global event enable, set on PerfEvtSel0
+        * We "disable" events by programming P6_NOP_EVENT
+        * and we rely on p6_pmu_enable_all() being called
+        * to actually enable the events.
+        */
+
+       (void)wrmsrl_safe(hwc->config_base, val);
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7"    );
+PMU_FORMAT_ATTR(umask, "config:8-15"   );
+PMU_FORMAT_ATTR(edge,  "config:18"     );
+PMU_FORMAT_ATTR(pc,    "config:19"     );
+PMU_FORMAT_ATTR(inv,   "config:23"     );
+PMU_FORMAT_ATTR(cmask, "config:24-31"  );
+
+static struct attribute *intel_p6_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_pc.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask.attr,
+       NULL,
+};
+
+static __initconst const struct x86_pmu p6_pmu = {
+       .name                   = "p6",
+       .handle_irq             = x86_pmu_handle_irq,
+       .disable_all            = p6_pmu_disable_all,
+       .enable_all             = p6_pmu_enable_all,
+       .enable                 = p6_pmu_enable_event,
+       .disable                = p6_pmu_disable_event,
+       .hw_config              = x86_pmu_hw_config,
+       .schedule_events        = x86_schedule_events,
+       .eventsel               = MSR_P6_EVNTSEL0,
+       .perfctr                = MSR_P6_PERFCTR0,
+       .event_map              = p6_pmu_event_map,
+       .max_events             = ARRAY_SIZE(p6_perfmon_event_map),
+       .apic                   = 1,
+       .max_period             = (1ULL << 31) - 1,
+       .version                = 0,
+       .num_counters           = 2,
+       /*
+        * Events have 40 bits implemented. However they are designed such
+        * that bits [32-39] are sign extensions of bit 31. As such the
+        * effective width of a event for P6-like PMU is 32 bits only.
+        *
+        * See IA-32 Intel Architecture Software developer manual Vol 3B
+        */
+       .cntval_bits            = 32,
+       .cntval_mask            = (1ULL << 32) - 1,
+       .get_event_constraints  = x86_get_event_constraints,
+       .event_constraints      = p6_event_constraints,
+
+       .format_attrs           = intel_p6_formats_attr,
+       .events_sysfs_show      = intel_event_sysfs_show,
+
+};
+
+static __init void p6_pmu_rdpmc_quirk(void)
+{
+       if (boot_cpu_data.x86_mask < 9) {
+               /*
+                * PPro erratum 26; fixed in stepping 9 and above.
+                */
+               pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n");
+               x86_pmu.attr_rdpmc_broken = 1;
+               x86_pmu.attr_rdpmc = 0;
+       }
+}
+
+__init int p6_pmu_init(void)
+{
+       x86_pmu = p6_pmu;
+
+       switch (boot_cpu_data.x86_model) {
+       case  1: /* Pentium Pro */
+               x86_add_quirk(p6_pmu_rdpmc_quirk);
+               break;
+
+       case  3: /* Pentium II - Klamath */
+       case  5: /* Pentium II - Deschutes */
+       case  6: /* Pentium II - Mendocino */
+               break;
+
+       case  7: /* Pentium III - Katmai */
+       case  8: /* Pentium III - Coppermine */
+       case 10: /* Pentium III Xeon */
+       case 11: /* Pentium III - Tualatin */
+               break;
+
+       case  9: /* Pentium M - Banias */
+       case 13: /* Pentium M - Dothan */
+               break;
+
+       default:
+               pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model);
+               return -ENODEV;
+       }
+
+       memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
+               sizeof(hw_cache_event_ids));
+
+       return 0;
+}
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c

new file mode 100644 (file)

index 0000000..6af7cf7
--- /dev/null
+++ b/arch/x86/events/intel/pt.c
@@ -0,0 +1,1188 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+
+#include <asm/perf_event.h>
+#include <asm/insn.h>
+#include <asm/io.h>
+#include <asm/intel_pt.h>
+
+#include "../perf_event.h"
+#include "pt.h"
+
+static DEFINE_PER_CPU(struct pt, pt_ctx);
+
+static struct pt_pmu pt_pmu;
+
+enum cpuid_regs {
+       CR_EAX = 0,
+       CR_ECX,
+       CR_EDX,
+       CR_EBX
+};
+
+/*
+ * Capabilities of Intel PT hardware, such as number of address bits or
+ * supported output schemes, are cached and exported to userspace as "caps"
+ * attribute group of pt pmu device
+ * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
+ * relevant bits together with intel_pt traces.
+ *
+ * These are necessary for both trace decoding (payloads_lip, contains address
+ * width encoded in IP-related packets), and event configuration (bitmasks with
+ * permitted values for certain bit fields).
+ */
+#define PT_CAP(_n, _l, _r, _m)                                         \
+       [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,        \
+                           .reg = _r, .mask = _m }
+
+static struct pt_cap_desc {
+       const char      *name;
+       u32             leaf;
+       u8              reg;
+       u32             mask;
+} pt_caps[] = {
+       PT_CAP(max_subleaf,             0, CR_EAX, 0xffffffff),
+       PT_CAP(cr3_filtering,           0, CR_EBX, BIT(0)),
+       PT_CAP(psb_cyc,                 0, CR_EBX, BIT(1)),
+       PT_CAP(mtc,                     0, CR_EBX, BIT(3)),
+       PT_CAP(topa_output,             0, CR_ECX, BIT(0)),
+       PT_CAP(topa_multiple_entries,   0, CR_ECX, BIT(1)),
+       PT_CAP(single_range_output,     0, CR_ECX, BIT(2)),
+       PT_CAP(payloads_lip,            0, CR_ECX, BIT(31)),
+       PT_CAP(mtc_periods,             1, CR_EAX, 0xffff0000),
+       PT_CAP(cycle_thresholds,        1, CR_EBX, 0xffff),
+       PT_CAP(psb_periods,             1, CR_EBX, 0xffff0000),
+};
+
+static u32 pt_cap_get(enum pt_capabilities cap)
+{
+       struct pt_cap_desc *cd = &pt_caps[cap];
+       u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
+       unsigned int shift = __ffs(cd->mask);
+
+       return (c & cd->mask) >> shift;
+}
+
+static ssize_t pt_cap_show(struct device *cdev,
+                          struct device_attribute *attr,
+                          char *buf)
+{
+       struct dev_ext_attribute *ea =
+               container_of(attr, struct dev_ext_attribute, attr);
+       enum pt_capabilities cap = (long)ea->var;
+
+       return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
+}
+
+static struct attribute_group pt_cap_group = {
+       .name   = "caps",
+};
+
+PMU_FORMAT_ATTR(cyc,           "config:1"      );
+PMU_FORMAT_ATTR(mtc,           "config:9"      );
+PMU_FORMAT_ATTR(tsc,           "config:10"     );
+PMU_FORMAT_ATTR(noretcomp,     "config:11"     );
+PMU_FORMAT_ATTR(mtc_period,    "config:14-17"  );
+PMU_FORMAT_ATTR(cyc_thresh,    "config:19-22"  );
+PMU_FORMAT_ATTR(psb_period,    "config:24-27"  );
+
+static struct attribute *pt_formats_attr[] = {
+       &format_attr_cyc.attr,
+       &format_attr_mtc.attr,
+       &format_attr_tsc.attr,
+       &format_attr_noretcomp.attr,
+       &format_attr_mtc_period.attr,
+       &format_attr_cyc_thresh.attr,
+       &format_attr_psb_period.attr,
+       NULL,
+};
+
+static struct attribute_group pt_format_group = {
+       .name   = "format",
+       .attrs  = pt_formats_attr,
+};
+
+static const struct attribute_group *pt_attr_groups[] = {
+       &pt_cap_group,
+       &pt_format_group,
+       NULL,
+};
+
+static int __init pt_pmu_hw_init(void)
+{
+       struct dev_ext_attribute *de_attrs;
+       struct attribute **attrs;
+       size_t size;
+       int ret;
+       long i;
+
+       attrs = NULL;
+
+       for (i = 0; i < PT_CPUID_LEAVES; i++) {
+               cpuid_count(20, i,
+                           &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM],
+                           &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM],
+                           &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM],
+                           &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]);
+       }
+
+       ret = -ENOMEM;
+       size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
+       attrs = kzalloc(size, GFP_KERNEL);
+       if (!attrs)
+               goto fail;
+
+       size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
+       de_attrs = kzalloc(size, GFP_KERNEL);
+       if (!de_attrs)
+               goto fail;
+
+       for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
+               struct dev_ext_attribute *de_attr = de_attrs + i;
+
+               de_attr->attr.attr.name = pt_caps[i].name;
+
+               sysfs_attr_init(&de_attr->attr.attr);
+
+               de_attr->attr.attr.mode         = S_IRUGO;
+               de_attr->attr.show              = pt_cap_show;
+               de_attr->var                    = (void *)i;
+
+               attrs[i] = &de_attr->attr.attr;
+       }
+
+       pt_cap_group.attrs = attrs;
+
+       return 0;
+
+fail:
+       kfree(attrs);
+
+       return ret;
+}
+
+#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC    | \
+                         RTIT_CTL_CYC_THRESH   | \
+                         RTIT_CTL_PSB_FREQ)
+
+#define RTIT_CTL_MTC   (RTIT_CTL_MTC_EN        | \
+                        RTIT_CTL_MTC_RANGE)
+
+#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN                | \
+                       RTIT_CTL_DISRETC        | \
+                       RTIT_CTL_CYC_PSB        | \
+                       RTIT_CTL_MTC)
+
+static bool pt_event_valid(struct perf_event *event)
+{
+       u64 config = event->attr.config;
+       u64 allowed, requested;
+
+       if ((config & PT_CONFIG_MASK) != config)
+               return false;
+
+       if (config & RTIT_CTL_CYC_PSB) {
+               if (!pt_cap_get(PT_CAP_psb_cyc))
+                       return false;
+
+               allowed = pt_cap_get(PT_CAP_psb_periods);
+               requested = (config & RTIT_CTL_PSB_FREQ) >>
+                       RTIT_CTL_PSB_FREQ_OFFSET;
+               if (requested && (!(allowed & BIT(requested))))
+                       return false;
+
+               allowed = pt_cap_get(PT_CAP_cycle_thresholds);
+               requested = (config & RTIT_CTL_CYC_THRESH) >>
+                       RTIT_CTL_CYC_THRESH_OFFSET;
+               if (requested && (!(allowed & BIT(requested))))
+                       return false;
+       }
+
+       if (config & RTIT_CTL_MTC) {
+               /*
+                * In the unlikely case that CPUID lists valid mtc periods,
+                * but not the mtc capability, drop out here.
+                *
+                * Spec says that setting mtc period bits while mtc bit in
+                * CPUID is 0 will #GP, so better safe than sorry.
+                */
+               if (!pt_cap_get(PT_CAP_mtc))
+                       return false;
+
+               allowed = pt_cap_get(PT_CAP_mtc_periods);
+               if (!allowed)
+                       return false;
+
+               requested = (config & RTIT_CTL_MTC_RANGE) >>
+                       RTIT_CTL_MTC_RANGE_OFFSET;
+
+               if (!(allowed & BIT(requested)))
+                       return false;
+       }
+
+       return true;
+}
+
+/*
+ * PT configuration helpers
+ * These all are cpu affine and operate on a local PT
+ */
+
+static void pt_config(struct perf_event *event)
+{
+       u64 reg;
+
+       if (!event->hw.itrace_started) {
+               event->hw.itrace_started = 1;
+               wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+       }
+
+       reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
+
+       if (!event->attr.exclude_kernel)
+               reg |= RTIT_CTL_OS;
+       if (!event->attr.exclude_user)
+               reg |= RTIT_CTL_USR;
+
+       reg |= (event->attr.config & PT_CONFIG_MASK);
+
+       wrmsrl(MSR_IA32_RTIT_CTL, reg);
+}
+
+static void pt_config_start(bool start)
+{
+       u64 ctl;
+
+       rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+       if (start)
+               ctl |= RTIT_CTL_TRACEEN;
+       else
+               ctl &= ~RTIT_CTL_TRACEEN;
+       wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+       /*
+        * A wrmsr that disables trace generation serializes other PT
+        * registers and causes all data packets to be written to memory,
+        * but a fence is required for the data to become globally visible.
+        *
+        * The below WMB, separating data store and aux_head store matches
+        * the consumer's RMB that separates aux_head load and data load.
+        */
+       if (!start)
+               wmb();
+}
+
+static void pt_config_buffer(void *buf, unsigned int topa_idx,
+                            unsigned int output_off)
+{
+       u64 reg;
+
+       wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
+
+       reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
+
+       wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+}
+
+/*
+ * Keep ToPA table-related metadata on the same page as the actual table,
+ * taking up a few words from the top
+ */
+
+#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
+
+/**
+ * struct topa - page-sized ToPA table with metadata at the top
+ * @table:     actual ToPA table entries, as understood by PT hardware
+ * @list:      linkage to struct pt_buffer's list of tables
+ * @phys:      physical address of this page
+ * @offset:    offset of the first entry in this table in the buffer
+ * @size:      total size of all entries in this table
+ * @last:      index of the last initialized entry in this table
+ */
+struct topa {
+       struct topa_entry       table[TENTS_PER_PAGE];
+       struct list_head        list;
+       u64                     phys;
+       u64                     offset;
+       size_t                  size;
+       int                     last;
+};
+
+/* make -1 stand for the last table entry */
+#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
+
+/**
+ * topa_alloc() - allocate page-sized ToPA table
+ * @cpu:       CPU on which to allocate.
+ * @gfp:       Allocation flags.
+ *
+ * Return:     On success, return the pointer to ToPA table page.
+ */
+static struct topa *topa_alloc(int cpu, gfp_t gfp)
+{
+       int node = cpu_to_node(cpu);
+       struct topa *topa;
+       struct page *p;
+
+       p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+       if (!p)
+               return NULL;
+
+       topa = page_address(p);
+       topa->last = 0;
+       topa->phys = page_to_phys(p);
+
+       /*
+        * In case of singe-entry ToPA, always put the self-referencing END
+        * link as the 2nd entry in the table
+        */
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
+               TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
+               TOPA_ENTRY(topa, 1)->end = 1;
+       }
+
+       return topa;
+}
+
+/**
+ * topa_free() - free a page-sized ToPA table
+ * @topa:      Table to deallocate.
+ */
+static void topa_free(struct topa *topa)
+{
+       free_page((unsigned long)topa);
+}
+
+/**
+ * topa_insert_table() - insert a ToPA table into a buffer
+ * @buf:        PT buffer that's being extended.
+ * @topa:       New topa table to be inserted.
+ *
+ * If it's the first table in this buffer, set up buffer's pointers
+ * accordingly; otherwise, add a END=1 link entry to @topa to the current
+ * "last" table and adjust the last table pointer to @topa.
+ */
+static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
+{
+       struct topa *last = buf->last;
+
+       list_add_tail(&topa->list, &buf->tables);
+
+       if (!buf->first) {
+               buf->first = buf->last = buf->cur = topa;
+               return;
+       }
+
+       topa->offset = last->offset + last->size;
+       buf->last = topa;
+
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+               return;
+
+       BUG_ON(last->last != TENTS_PER_PAGE - 1);
+
+       TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
+       TOPA_ENTRY(last, -1)->end = 1;
+}
+
+/**
+ * topa_table_full() - check if a ToPA table is filled up
+ * @topa:      ToPA table.
+ */
+static bool topa_table_full(struct topa *topa)
+{
+       /* single-entry ToPA is a special case */
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+               return !!topa->last;
+
+       return topa->last == TENTS_PER_PAGE - 1;
+}
+
+/**
+ * topa_insert_pages() - create a list of ToPA tables
+ * @buf:       PT buffer being initialized.
+ * @gfp:       Allocation flags.
+ *
+ * This initializes a list of ToPA tables with entries from
+ * the data_pages provided by rb_alloc_aux().
+ *
+ * Return:     0 on success or error code.
+ */
+static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
+{
+       struct topa *topa = buf->last;
+       int order = 0;
+       struct page *p;
+
+       p = virt_to_page(buf->data_pages[buf->nr_pages]);
+       if (PagePrivate(p))
+               order = page_private(p);
+
+       if (topa_table_full(topa)) {
+               topa = topa_alloc(buf->cpu, gfp);
+               if (!topa)
+                       return -ENOMEM;
+
+               topa_insert_table(buf, topa);
+       }
+
+       TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
+       TOPA_ENTRY(topa, -1)->size = order;
+       if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
+               TOPA_ENTRY(topa, -1)->intr = 1;
+               TOPA_ENTRY(topa, -1)->stop = 1;
+       }
+
+       topa->last++;
+       topa->size += sizes(order);
+
+       buf->nr_pages += 1ul << order;
+
+       return 0;
+}
+
+/**
+ * pt_topa_dump() - print ToPA tables and their entries
+ * @buf:       PT buffer.
+ */
+static void pt_topa_dump(struct pt_buffer *buf)
+{
+       struct topa *topa;
+
+       list_for_each_entry(topa, &buf->tables, list) {
+               int i;
+
+               pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
+                        topa->phys, topa->offset, topa->size);
+               for (i = 0; i < TENTS_PER_PAGE; i++) {
+                       pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
+                                &topa->table[i],
+                                (unsigned long)topa->table[i].base << TOPA_SHIFT,
+                                sizes(topa->table[i].size),
+                                topa->table[i].end ?  'E' : ' ',
+                                topa->table[i].intr ? 'I' : ' ',
+                                topa->table[i].stop ? 'S' : ' ',
+                                *(u64 *)&topa->table[i]);
+                       if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
+                            topa->table[i].stop) ||
+                           topa->table[i].end)
+                               break;
+               }
+       }
+}
+
+/**
+ * pt_buffer_advance() - advance to the next output region
+ * @buf:       PT buffer.
+ *
+ * Advance the current pointers in the buffer to the next ToPA entry.
+ */
+static void pt_buffer_advance(struct pt_buffer *buf)
+{
+       buf->output_off = 0;
+       buf->cur_idx++;
+
+       if (buf->cur_idx == buf->cur->last) {
+               if (buf->cur == buf->last)
+                       buf->cur = buf->first;
+               else
+                       buf->cur = list_entry(buf->cur->list.next, struct topa,
+                                             list);
+               buf->cur_idx = 0;
+       }
+}
+
+/**
+ * pt_update_head() - calculate current offsets and sizes
+ * @pt:                Per-cpu pt context.
+ *
+ * Update buffer's current write pointer position and data size.
+ */
+static void pt_update_head(struct pt *pt)
+{
+       struct pt_buffer *buf = perf_get_aux(&pt->handle);
+       u64 topa_idx, base, old;
+
+       /* offset of the first region in this table from the beginning of buf */
+       base = buf->cur->offset + buf->output_off;
+
+       /* offset of the current output region within this table */
+       for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
+               base += sizes(buf->cur->table[topa_idx].size);
+
+       if (buf->snapshot) {
+               local_set(&buf->data_size, base);
+       } else {
+               old = (local64_xchg(&buf->head, base) &
+                      ((buf->nr_pages << PAGE_SHIFT) - 1));
+               if (base < old)
+                       base += buf->nr_pages << PAGE_SHIFT;
+
+               local_add(base - old, &buf->data_size);
+       }
+}
+
+/**
+ * pt_buffer_region() - obtain current output region's address
+ * @buf:       PT buffer.
+ */
+static void *pt_buffer_region(struct pt_buffer *buf)
+{
+       return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
+}
+
+/**
+ * pt_buffer_region_size() - obtain current output region's size
+ * @buf:       PT buffer.
+ */
+static size_t pt_buffer_region_size(struct pt_buffer *buf)
+{
+       return sizes(buf->cur->table[buf->cur_idx].size);
+}
+
+/**
+ * pt_handle_status() - take care of possible status conditions
+ * @pt:                Per-cpu pt context.
+ */
+static void pt_handle_status(struct pt *pt)
+{
+       struct pt_buffer *buf = perf_get_aux(&pt->handle);
+       int advance = 0;
+       u64 status;
+
+       rdmsrl(MSR_IA32_RTIT_STATUS, status);
+
+       if (status & RTIT_STATUS_ERROR) {
+               pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
+               pt_topa_dump(buf);
+               status &= ~RTIT_STATUS_ERROR;
+       }
+
+       if (status & RTIT_STATUS_STOPPED) {
+               status &= ~RTIT_STATUS_STOPPED;
+
+               /*
+                * On systems that only do single-entry ToPA, hitting STOP
+                * means we are already losing data; need to let the decoder
+                * know.
+                */
+               if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
+                   buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+                       local_inc(&buf->lost);
+                       advance++;
+               }
+       }
+
+       /*
+        * Also on single-entry ToPA implementations, interrupt will come
+        * before the output reaches its output region's boundary.
+        */
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
+           pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
+               void *head = pt_buffer_region(buf);
+
+               /* everything within this margin needs to be zeroed out */
+               memset(head + buf->output_off, 0,
+                      pt_buffer_region_size(buf) -
+                      buf->output_off);
+               advance++;
+       }
+
+       if (advance)
+               pt_buffer_advance(buf);
+
+       wrmsrl(MSR_IA32_RTIT_STATUS, status);
+}
+
+/**
+ * pt_read_offset() - translate registers into buffer pointers
+ * @buf:       PT buffer.
+ *
+ * Set buffer's output pointers from MSR values.
+ */
+static void pt_read_offset(struct pt_buffer *buf)
+{
+       u64 offset, base_topa;
+
+       rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
+       buf->cur = phys_to_virt(base_topa);
+
+       rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
+       /* offset within current output region */
+       buf->output_off = offset >> 32;
+       /* index of current output region within this table */
+       buf->cur_idx = (offset & 0xffffff80) >> 7;
+}
+
+/**
+ * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
+ * @buf:       PT buffer.
+ * @pg:                Page offset in the buffer.
+ *
+ * When advancing to the next output region (ToPA entry), given a page offset
+ * into the buffer, we need to find the offset of the first page in the next
+ * region.
+ */
+static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
+{
+       struct topa_entry *te = buf->topa_index[pg];
+
+       /* one region */
+       if (buf->first == buf->last && buf->first->last == 1)
+               return pg;
+
+       do {
+               pg++;
+               pg &= buf->nr_pages - 1;
+       } while (buf->topa_index[pg] == te);
+
+       return pg;
+}
+
+/**
+ * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
+ * @buf:       PT buffer.
+ * @handle:    Current output handle.
+ *
+ * Place INT and STOP marks to prevent overwriting old data that the consumer
+ * hasn't yet collected and waking up the consumer after a certain fraction of
+ * the buffer has filled up. Only needed and sensible for non-snapshot counters.
+ *
+ * This obviously relies on buf::head to figure out buffer markers, so it has
+ * to be called after pt_buffer_reset_offsets() and before the hardware tracing
+ * is enabled.
+ */
+static int pt_buffer_reset_markers(struct pt_buffer *buf,
+                                  struct perf_output_handle *handle)
+
+{
+       unsigned long head = local64_read(&buf->head);
+       unsigned long idx, npages, wakeup;
+
+       /* can't stop in the middle of an output region */
+       if (buf->output_off + handle->size + 1 <
+           sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
+               return -EINVAL;
+
+
+       /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+               return 0;
+
+       /* clear STOP and INT from current entry */
+       buf->topa_index[buf->stop_pos]->stop = 0;
+       buf->topa_index[buf->intr_pos]->intr = 0;
+
+       /* how many pages till the STOP marker */
+       npages = handle->size >> PAGE_SHIFT;
+
+       /* if it's on a page boundary, fill up one more page */
+       if (!offset_in_page(head + handle->size + 1))
+               npages++;
+
+       idx = (head >> PAGE_SHIFT) + npages;
+       idx &= buf->nr_pages - 1;
+       buf->stop_pos = idx;
+
+       wakeup = handle->wakeup >> PAGE_SHIFT;
+
+       /* in the worst case, wake up the consumer one page before hard stop */
+       idx = (head >> PAGE_SHIFT) + npages - 1;
+       if (idx > wakeup)
+               idx = wakeup;
+
+       idx &= buf->nr_pages - 1;
+       buf->intr_pos = idx;
+
+       buf->topa_index[buf->stop_pos]->stop = 1;
+       buf->topa_index[buf->intr_pos]->intr = 1;
+
+       return 0;
+}
+
+/**
+ * pt_buffer_setup_topa_index() - build topa_index[] table of regions
+ * @buf:       PT buffer.
+ *
+ * topa_index[] references output regions indexed by offset into the
+ * buffer for purposes of quick reverse lookup.
+ */
+static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
+{
+       struct topa *cur = buf->first, *prev = buf->last;
+       struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
+               *te_prev = TOPA_ENTRY(prev, prev->last - 1);
+       int pg = 0, idx = 0;
+
+       while (pg < buf->nr_pages) {
+               int tidx;
+
+               /* pages within one topa entry */
+               for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
+                       buf->topa_index[pg] = te_prev;
+
+               te_prev = te_cur;
+
+               if (idx == cur->last - 1) {
+                       /* advance to next topa table */
+                       idx = 0;
+                       cur = list_entry(cur->list.next, struct topa, list);
+               } else {
+                       idx++;
+               }
+               te_cur = TOPA_ENTRY(cur, idx);
+       }
+
+}
+
+/**
+ * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
+ * @buf:       PT buffer.
+ * @head:      Write pointer (aux_head) from AUX buffer.
+ *
+ * Find the ToPA table and entry corresponding to given @head and set buffer's
+ * "current" pointers accordingly. This is done after we have obtained the
+ * current aux_head position from a successful call to perf_aux_output_begin()
+ * to make sure the hardware is writing to the right place.
+ *
+ * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
+ * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
+ * which are used to determine INT and STOP markers' locations by a subsequent
+ * call to pt_buffer_reset_markers().
+ */
+static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
+{
+       int pg;
+
+       if (buf->snapshot)
+               head &= (buf->nr_pages << PAGE_SHIFT) - 1;
+
+       pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
+       pg = pt_topa_next_entry(buf, pg);
+
+       buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
+       buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
+                       (unsigned long)buf->cur) / sizeof(struct topa_entry);
+       buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
+
+       local64_set(&buf->head, head);
+       local_set(&buf->data_size, 0);
+}
+
+/**
+ * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
+ * @buf:       PT buffer.
+ */
+static void pt_buffer_fini_topa(struct pt_buffer *buf)
+{
+       struct topa *topa, *iter;
+
+       list_for_each_entry_safe(topa, iter, &buf->tables, list) {
+               /*
+                * right now, this is in free_aux() path only, so
+                * no need to unlink this table from the list
+                */
+               topa_free(topa);
+       }
+}
+
+/**
+ * pt_buffer_init_topa() - initialize ToPA table for pt buffer
+ * @buf:       PT buffer.
+ * @size:      Total size of all regions within this ToPA.
+ * @gfp:       Allocation flags.
+ */
+static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
+                              gfp_t gfp)
+{
+       struct topa *topa;
+       int err;
+
+       topa = topa_alloc(buf->cpu, gfp);
+       if (!topa)
+               return -ENOMEM;
+
+       topa_insert_table(buf, topa);
+
+       while (buf->nr_pages < nr_pages) {
+               err = topa_insert_pages(buf, gfp);
+               if (err) {
+                       pt_buffer_fini_topa(buf);
+                       return -ENOMEM;
+               }
+       }
+
+       pt_buffer_setup_topa_index(buf);
+
+       /* link last table to the first one, unless we're double buffering */
+       if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+               TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
+               TOPA_ENTRY(buf->last, -1)->end = 1;
+       }
+
+       pt_topa_dump(buf);
+       return 0;
+}
+
+/**
+ * pt_buffer_setup_aux() - set up topa tables for a PT buffer
+ * @cpu:       Cpu on which to allocate, -1 means current.
+ * @pages:     Array of pointers to buffer pages passed from perf core.
+ * @nr_pages:  Number of pages in the buffer.
+ * @snapshot:  If this is a snapshot/overwrite counter.
+ *
+ * This is a pmu::setup_aux callback that sets up ToPA tables and all the
+ * bookkeeping for an AUX buffer.
+ *
+ * Return:     Our private PT buffer structure.
+ */
+static void *
+pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
+{
+       struct pt_buffer *buf;
+       int node, ret;
+
+       if (!nr_pages)
+               return NULL;
+
+       if (cpu == -1)
+               cpu = raw_smp_processor_id();
+       node = cpu_to_node(cpu);
+
+       buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
+                          GFP_KERNEL, node);
+       if (!buf)
+               return NULL;
+
+       buf->cpu = cpu;
+       buf->snapshot = snapshot;
+       buf->data_pages = pages;
+
+       INIT_LIST_HEAD(&buf->tables);
+
+       ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
+       if (ret) {
+               kfree(buf);
+               return NULL;
+       }
+
+       return buf;
+}
+
+/**
+ * pt_buffer_free_aux() - perf AUX deallocation path callback
+ * @data:      PT buffer.
+ */
+static void pt_buffer_free_aux(void *data)
+{
+       struct pt_buffer *buf = data;
+
+       pt_buffer_fini_topa(buf);
+       kfree(buf);
+}
+
+/**
+ * pt_buffer_is_full() - check if the buffer is full
+ * @buf:       PT buffer.
+ * @pt:                Per-cpu pt handle.
+ *
+ * If the user hasn't read data from the output region that aux_head
+ * points to, the buffer is considered full: the user needs to read at
+ * least this region and update aux_tail to point past it.
+ */
+static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
+{
+       if (buf->snapshot)
+               return false;
+
+       if (local_read(&buf->data_size) >= pt->handle.size)
+               return true;
+
+       return false;
+}
+
+/**
+ * intel_pt_interrupt() - PT PMI handler
+ */
+void intel_pt_interrupt(void)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+       struct pt_buffer *buf;
+       struct perf_event *event = pt->handle.event;
+
+       /*
+        * There may be a dangling PT bit in the interrupt status register
+        * after PT has been disabled by pt_event_stop(). Make sure we don't
+        * do anything (particularly, re-enable) for this event here.
+        */
+       if (!ACCESS_ONCE(pt->handle_nmi))
+               return;
+
+       pt_config_start(false);
+
+       if (!event)
+               return;
+
+       buf = perf_get_aux(&pt->handle);
+       if (!buf)
+               return;
+
+       pt_read_offset(buf);
+
+       pt_handle_status(pt);
+
+       pt_update_head(pt);
+
+       perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+                           local_xchg(&buf->lost, 0));
+
+       if (!event->hw.state) {
+               int ret;
+
+               buf = perf_aux_output_begin(&pt->handle, event);
+               if (!buf) {
+                       event->hw.state = PERF_HES_STOPPED;
+                       return;
+               }
+
+               pt_buffer_reset_offsets(buf, pt->handle.head);
+               /* snapshot counters don't use PMI, so it's safe */
+               ret = pt_buffer_reset_markers(buf, &pt->handle);
+               if (ret) {
+                       perf_aux_output_end(&pt->handle, 0, true);
+                       return;
+               }
+
+               pt_config_buffer(buf->cur->table, buf->cur_idx,
+                                buf->output_off);
+               pt_config(event);
+       }
+}
+
+/*
+ * PMU callbacks
+ */
+
+static void pt_event_start(struct perf_event *event, int mode)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+       struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+       if (!buf || pt_buffer_is_full(buf, pt)) {
+               event->hw.state = PERF_HES_STOPPED;
+               return;
+       }
+
+       ACCESS_ONCE(pt->handle_nmi) = 1;
+       event->hw.state = 0;
+
+       pt_config_buffer(buf->cur->table, buf->cur_idx,
+                        buf->output_off);
+       pt_config(event);
+}
+
+static void pt_event_stop(struct perf_event *event, int mode)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+       /*
+        * Protect against the PMI racing with disabling wrmsr,
+        * see comment in intel_pt_interrupt().
+        */
+       ACCESS_ONCE(pt->handle_nmi) = 0;
+       pt_config_start(false);
+
+       if (event->hw.state == PERF_HES_STOPPED)
+               return;
+
+       event->hw.state = PERF_HES_STOPPED;
+
+       if (mode & PERF_EF_UPDATE) {
+               struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+               if (!buf)
+                       return;
+
+               if (WARN_ON_ONCE(pt->handle.event != event))
+                       return;
+
+               pt_read_offset(buf);
+
+               pt_handle_status(pt);
+
+               pt_update_head(pt);
+       }
+}
+
+static void pt_event_del(struct perf_event *event, int mode)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+       struct pt_buffer *buf;
+
+       pt_event_stop(event, PERF_EF_UPDATE);
+
+       buf = perf_get_aux(&pt->handle);
+
+       if (buf) {
+               if (buf->snapshot)
+                       pt->handle.head =
+                               local_xchg(&buf->data_size,
+                                          buf->nr_pages << PAGE_SHIFT);
+               perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+                                   local_xchg(&buf->lost, 0));
+       }
+}
+
+static int pt_event_add(struct perf_event *event, int mode)
+{
+       struct pt_buffer *buf;
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+       struct hw_perf_event *hwc = &event->hw;
+       int ret = -EBUSY;
+
+       if (pt->handle.event)
+               goto fail;
+
+       buf = perf_aux_output_begin(&pt->handle, event);
+       ret = -EINVAL;
+       if (!buf)
+               goto fail_stop;
+
+       pt_buffer_reset_offsets(buf, pt->handle.head);
+       if (!buf->snapshot) {
+               ret = pt_buffer_reset_markers(buf, &pt->handle);
+               if (ret)
+                       goto fail_end_stop;
+       }
+
+       if (mode & PERF_EF_START) {
+               pt_event_start(event, 0);
+               ret = -EBUSY;
+               if (hwc->state == PERF_HES_STOPPED)
+                       goto fail_end_stop;
+       } else {
+               hwc->state = PERF_HES_STOPPED;
+       }
+
+       return 0;
+
+fail_end_stop:
+       perf_aux_output_end(&pt->handle, 0, true);
+fail_stop:
+       hwc->state = PERF_HES_STOPPED;
+fail:
+       return ret;
+}
+
+static void pt_event_read(struct perf_event *event)
+{
+}
+
+static void pt_event_destroy(struct perf_event *event)
+{
+       x86_del_exclusive(x86_lbr_exclusive_pt);
+}
+
+static int pt_event_init(struct perf_event *event)
+{
+       if (event->attr.type != pt_pmu.pmu.type)
+               return -ENOENT;
+
+       if (!pt_event_valid(event))
+               return -EINVAL;
+
+       if (x86_add_exclusive(x86_lbr_exclusive_pt))
+               return -EBUSY;
+
+       event->destroy = pt_event_destroy;
+
+       return 0;
+}
+
+void cpu_emergency_stop_pt(void)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+       if (pt->handle.event)
+               pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
+}
+
+static __init int pt_init(void)
+{
+       int ret, cpu, prior_warn = 0;
+
+       BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
+
+       if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
+               return -ENODEV;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
+               u64 ctl;
+
+               ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
+               if (!ret && (ctl & RTIT_CTL_TRACEEN))
+                       prior_warn++;
+       }
+       put_online_cpus();
+
+       if (prior_warn) {
+               x86_add_exclusive(x86_lbr_exclusive_pt);
+               pr_warn("PT is enabled at boot time, doing nothing\n");
+
+               return -EBUSY;
+       }
+
+       ret = pt_pmu_hw_init();
+       if (ret)
+               return ret;
+
+       if (!pt_cap_get(PT_CAP_topa_output)) {
+               pr_warn("ToPA output is not supported on this CPU\n");
+               return -ENODEV;
+       }
+
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+               pt_pmu.pmu.capabilities =
+                       PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
+
+       pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
+       pt_pmu.pmu.attr_groups  = pt_attr_groups;
+       pt_pmu.pmu.task_ctx_nr  = perf_sw_context;
+       pt_pmu.pmu.event_init   = pt_event_init;
+       pt_pmu.pmu.add          = pt_event_add;
+       pt_pmu.pmu.del          = pt_event_del;
+       pt_pmu.pmu.start        = pt_event_start;
+       pt_pmu.pmu.stop         = pt_event_stop;
+       pt_pmu.pmu.read         = pt_event_read;
+       pt_pmu.pmu.setup_aux    = pt_buffer_setup_aux;
+       pt_pmu.pmu.free_aux     = pt_buffer_free_aux;
+       ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
+
+       return ret;
+}
+arch_initcall(pt_init);
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h

new file mode 100644 (file)

index 0000000..336878a
--- /dev/null
+++ b/arch/x86/events/intel/pt.h
@@ -0,0 +1,116 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#ifndef __INTEL_PT_H__
+#define __INTEL_PT_H__
+
+/*
+ * Single-entry ToPA: when this close to region boundary, switch
+ * buffers to avoid losing data.
+ */
+#define TOPA_PMI_MARGIN 512
+
+#define TOPA_SHIFT 12
+
+static inline unsigned int sizes(unsigned int tsz)
+{
+       return 1 << (tsz + TOPA_SHIFT);
+};
+
+struct topa_entry {
+       u64     end     : 1;
+       u64     rsvd0   : 1;
+       u64     intr    : 1;
+       u64     rsvd1   : 1;
+       u64     stop    : 1;
+       u64     rsvd2   : 1;
+       u64     size    : 4;
+       u64     rsvd3   : 2;
+       u64     base    : 36;
+       u64     rsvd4   : 16;
+};
+
+#define PT_CPUID_LEAVES                2
+#define PT_CPUID_REGS_NUM      4 /* number of regsters (eax, ebx, ecx, edx) */
+
+enum pt_capabilities {
+       PT_CAP_max_subleaf = 0,
+       PT_CAP_cr3_filtering,
+       PT_CAP_psb_cyc,
+       PT_CAP_mtc,
+       PT_CAP_topa_output,
+       PT_CAP_topa_multiple_entries,
+       PT_CAP_single_range_output,
+       PT_CAP_payloads_lip,
+       PT_CAP_mtc_periods,
+       PT_CAP_cycle_thresholds,
+       PT_CAP_psb_periods,
+};
+
+struct pt_pmu {
+       struct pmu              pmu;
+       u32                     caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+};
+
+/**
+ * struct pt_buffer - buffer configuration; one buffer per task_struct or
+ *             cpu, depending on perf event configuration
+ * @cpu:       cpu for per-cpu allocation
+ * @tables:    list of ToPA tables in this buffer
+ * @first:     shorthand for first topa table
+ * @last:      shorthand for last topa table
+ * @cur:       current topa table
+ * @nr_pages:  buffer size in pages
+ * @cur_idx:   current output region's index within @cur table
+ * @output_off:        offset within the current output region
+ * @data_size: running total of the amount of data in this buffer
+ * @lost:      if data was lost/truncated
+ * @head:      logical write offset inside the buffer
+ * @snapshot:  if this is for a snapshot/overwrite counter
+ * @stop_pos:  STOP topa entry in the buffer
+ * @intr_pos:  INT topa entry in the buffer
+ * @data_pages:        array of pages from perf
+ * @topa_index:        table of topa entries indexed by page offset
+ */
+struct pt_buffer {
+       int                     cpu;
+       struct list_head        tables;
+       struct topa             *first, *last, *cur;
+       unsigned int            cur_idx;
+       size_t                  output_off;
+       unsigned long           nr_pages;
+       local_t                 data_size;
+       local_t                 lost;
+       local64_t               head;
+       bool                    snapshot;
+       unsigned long           stop_pos, intr_pos;
+       void                    **data_pages;
+       struct topa_entry       *topa_index[0];
+};
+
+/**
+ * struct pt - per-cpu pt context
+ * @handle:    perf output handle
+ * @handle_nmi:        do handle PT PMI on this cpu, there's an active event
+ */
+struct pt {
+       struct perf_output_handle handle;
+       int                     handle_nmi;
+};
+
+#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c

new file mode 100644 (file)

index 0000000..b834a3f
--- /dev/null
+++ b/arch/x86/events/intel/rapl.c
@@ -0,0 +1,767 @@
+/*
+ * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
+ * Copyright (C) 2013 Google, Inc., Stephane Eranian
+ *
+ * Intel RAPL interface is specified in the IA-32 Manual Vol3b
+ * section 14.7.1 (September 2013)
+ *
+ * RAPL provides more controls than just reporting energy consumption
+ * however here we only expose the 3 energy consumption free running
+ * counters (pp0, pkg, dram).
+ *
+ * Each of those counters increments in a power unit defined by the
+ * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
+ * but it can vary.
+ *
+ * Counter to rapl events mappings:
+ *
+ *  pp0 counter: consumption of all physical cores (power plane 0)
+ *       event: rapl_energy_cores
+ *    perf code: 0x1
+ *
+ *  pkg counter: consumption of the whole processor package
+ *       event: rapl_energy_pkg
+ *    perf code: 0x2
+ *
+ * dram counter: consumption of the dram domain (servers only)
+ *       event: rapl_energy_dram
+ *    perf code: 0x3
+ *
+ * dram counter: consumption of the builtin-gpu domain (client only)
+ *       event: rapl_energy_gpu
+ *    perf code: 0x4
+ *
+ * We manage those counters as free running (read-only). They may be
+ * use simultaneously by other tools, such as turbostat.
+ *
+ * The events only support system-wide mode counting. There is no
+ * sampling support because it does not make sense and is not
+ * supported by the RAPL hardware.
+ *
+ * Because we want to avoid floating-point operations in the kernel,
+ * the events are all reported in fixed point arithmetic (32.32).
+ * Tools must adjust the counts to convert them to Watts using
+ * the duration of the measurement. Tools may use a function such as
+ * ldexp(raw_count, -32);
+ */
+
+#define pr_fmt(fmt) "RAPL PMU: " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "../perf_event.h"
+
+/*
+ * RAPL energy status counters
+ */
+#define RAPL_IDX_PP0_NRG_STAT  0       /* all cores */
+#define INTEL_RAPL_PP0         0x1     /* pseudo-encoding */
+#define RAPL_IDX_PKG_NRG_STAT  1       /* entire package */
+#define INTEL_RAPL_PKG         0x2     /* pseudo-encoding */
+#define RAPL_IDX_RAM_NRG_STAT  2       /* DRAM */
+#define INTEL_RAPL_RAM         0x3     /* pseudo-encoding */
+#define RAPL_IDX_PP1_NRG_STAT  3       /* gpu */
+#define INTEL_RAPL_PP1         0x4     /* pseudo-encoding */
+
+#define NR_RAPL_DOMAINS         0x4
+static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
+       "pp0-core",
+       "package",
+       "dram",
+       "pp1-gpu",
+};
+
+/* Clients have PP0, PKG */
+#define RAPL_IDX_CLN   (1<<RAPL_IDX_PP0_NRG_STAT|\
+                        1<<RAPL_IDX_PKG_NRG_STAT|\
+                        1<<RAPL_IDX_PP1_NRG_STAT)
+
+/* Servers have PP0, PKG, RAM */
+#define RAPL_IDX_SRV   (1<<RAPL_IDX_PP0_NRG_STAT|\
+                        1<<RAPL_IDX_PKG_NRG_STAT|\
+                        1<<RAPL_IDX_RAM_NRG_STAT)
+
+/* Servers have PP0, PKG, RAM, PP1 */
+#define RAPL_IDX_HSW   (1<<RAPL_IDX_PP0_NRG_STAT|\
+                        1<<RAPL_IDX_PKG_NRG_STAT|\
+                        1<<RAPL_IDX_RAM_NRG_STAT|\
+                        1<<RAPL_IDX_PP1_NRG_STAT)
+
+/* Knights Landing has PKG, RAM */
+#define RAPL_IDX_KNL   (1<<RAPL_IDX_PKG_NRG_STAT|\
+                        1<<RAPL_IDX_RAM_NRG_STAT)
+
+/*
+ * event code: LSB 8 bits, passed in attr->config
+ * any other bit is reserved
+ */
+#define RAPL_EVENT_MASK        0xFFULL
+
+#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)          \
+static ssize_t __rapl_##_var##_show(struct kobject *kobj,      \
+                               struct kobj_attribute *attr,    \
+                               char *page)                     \
+{                                                              \
+       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
+       return sprintf(page, _format "\n");                     \
+}                                                              \
+static struct kobj_attribute format_attr_##_var =              \
+       __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
+
+#define RAPL_CNTR_WIDTH 32
+
+#define RAPL_EVENT_ATTR_STR(_name, v, str)                                     \
+static struct perf_pmu_events_attr event_attr_##v = {                          \
+       .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL),     \
+       .id             = 0,                                                    \
+       .event_str      = str,                                                  \
+};
+
+struct rapl_pmu {
+       raw_spinlock_t          lock;
+       int                     n_active;
+       int                     cpu;
+       struct list_head        active_list;
+       struct pmu              *pmu;
+       ktime_t                 timer_interval;
+       struct hrtimer          hrtimer;
+};
+
+struct rapl_pmus {
+       struct pmu              pmu;
+       unsigned int            maxpkg;
+       struct rapl_pmu         *pmus[];
+};
+
+ /* 1/2^hw_unit Joule */
+static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
+static struct rapl_pmus *rapl_pmus;
+static cpumask_t rapl_cpu_mask;
+static unsigned int rapl_cntr_mask;
+static u64 rapl_timer_ms;
+
+static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
+{
+       return rapl_pmus->pmus[topology_logical_package_id(cpu)];
+}
+
+static inline u64 rapl_read_counter(struct perf_event *event)
+{
+       u64 raw;
+       rdmsrl(event->hw.event_base, raw);
+       return raw;
+}
+
+static inline u64 rapl_scale(u64 v, int cfg)
+{
+       if (cfg > NR_RAPL_DOMAINS) {
+               pr_warn("Invalid domain %d, failed to scale data\n", cfg);
+               return v;
+       }
+       /*
+        * scale delta to smallest unit (1/2^32)
+        * users must then scale back: count * 1/(1e9*2^32) to get Joules
+        * or use ldexp(count, -32).
+        * Watts = Joules/Time delta
+        */
+       return v << (32 - rapl_hw_unit[cfg - 1]);
+}
+
+static u64 rapl_event_update(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 prev_raw_count, new_raw_count;
+       s64 delta, sdelta;
+       int shift = RAPL_CNTR_WIDTH;
+
+again:
+       prev_raw_count = local64_read(&hwc->prev_count);
+       rdmsrl(event->hw.event_base, new_raw_count);
+
+       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                           new_raw_count) != prev_raw_count) {
+               cpu_relax();
+               goto again;
+       }
+
+       /*
+        * Now we have the new raw value and have updated the prev
+        * timestamp already. We can now calculate the elapsed delta
+        * (event-)time and add that to the generic event.
+        *
+        * Careful, not all hw sign-extends above the physical width
+        * of the count.
+        */
+       delta = (new_raw_count << shift) - (prev_raw_count << shift);
+       delta >>= shift;
+
+       sdelta = rapl_scale(delta, event->hw.config);
+
+       local64_add(sdelta, &event->count);
+
+       return new_raw_count;
+}
+
+static void rapl_start_hrtimer(struct rapl_pmu *pmu)
+{
+       hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
+                    HRTIMER_MODE_REL_PINNED);
+}
+
+static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
+{
+       struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
+       struct perf_event *event;
+       unsigned long flags;
+
+       if (!pmu->n_active)
+               return HRTIMER_NORESTART;
+
+       raw_spin_lock_irqsave(&pmu->lock, flags);
+
+       list_for_each_entry(event, &pmu->active_list, active_entry)
+               rapl_event_update(event);
+
+       raw_spin_unlock_irqrestore(&pmu->lock, flags);
+
+       hrtimer_forward_now(hrtimer, pmu->timer_interval);
+
+       return HRTIMER_RESTART;
+}
+
+static void rapl_hrtimer_init(struct rapl_pmu *pmu)
+{
+       struct hrtimer *hr = &pmu->hrtimer;
+
+       hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hr->function = rapl_hrtimer_handle;
+}
+
+static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
+                                  struct perf_event *event)
+{
+       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+               return;
+
+       event->hw.state = 0;
+
+       list_add_tail(&event->active_entry, &pmu->active_list);
+
+       local64_set(&event->hw.prev_count, rapl_read_counter(event));
+
+       pmu->n_active++;
+       if (pmu->n_active == 1)
+               rapl_start_hrtimer(pmu);
+}
+
+static void rapl_pmu_event_start(struct perf_event *event, int mode)
+{
+       struct rapl_pmu *pmu = event->pmu_private;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&pmu->lock, flags);
+       __rapl_pmu_event_start(pmu, event);
+       raw_spin_unlock_irqrestore(&pmu->lock, flags);
+}
+
+static void rapl_pmu_event_stop(struct perf_event *event, int mode)
+{
+       struct rapl_pmu *pmu = event->pmu_private;
+       struct hw_perf_event *hwc = &event->hw;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&pmu->lock, flags);
+
+       /* mark event as deactivated and stopped */
+       if (!(hwc->state & PERF_HES_STOPPED)) {
+               WARN_ON_ONCE(pmu->n_active <= 0);
+               pmu->n_active--;
+               if (pmu->n_active == 0)
+                       hrtimer_cancel(&pmu->hrtimer);
+
+               list_del(&event->active_entry);
+
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+       }
+
+       /* check if update of sw counter is necessary */
+       if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               /*
+                * Drain the remaining delta count out of a event
+                * that we are disabling:
+                */
+               rapl_event_update(event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+
+       raw_spin_unlock_irqrestore(&pmu->lock, flags);
+}
+
+static int rapl_pmu_event_add(struct perf_event *event, int mode)
+{
+       struct rapl_pmu *pmu = event->pmu_private;
+       struct hw_perf_event *hwc = &event->hw;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&pmu->lock, flags);
+
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+       if (mode & PERF_EF_START)
+               __rapl_pmu_event_start(pmu, event);
+
+       raw_spin_unlock_irqrestore(&pmu->lock, flags);
+
+       return 0;
+}
+
+static void rapl_pmu_event_del(struct perf_event *event, int flags)
+{
+       rapl_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int rapl_pmu_event_init(struct perf_event *event)
+{
+       u64 cfg = event->attr.config & RAPL_EVENT_MASK;
+       int bit, msr, ret = 0;
+       struct rapl_pmu *pmu;
+
+       /* only look at RAPL events */
+       if (event->attr.type != rapl_pmus->pmu.type)
+               return -ENOENT;
+
+       /* check only supported bits are set */
+       if (event->attr.config & ~RAPL_EVENT_MASK)
+               return -EINVAL;
+
+       if (event->cpu < 0)
+               return -EINVAL;
+
+       /*
+        * check event is known (determines counter)
+        */
+       switch (cfg) {
+       case INTEL_RAPL_PP0:
+               bit = RAPL_IDX_PP0_NRG_STAT;
+               msr = MSR_PP0_ENERGY_STATUS;
+               break;
+       case INTEL_RAPL_PKG:
+               bit = RAPL_IDX_PKG_NRG_STAT;
+               msr = MSR_PKG_ENERGY_STATUS;
+               break;
+       case INTEL_RAPL_RAM:
+               bit = RAPL_IDX_RAM_NRG_STAT;
+               msr = MSR_DRAM_ENERGY_STATUS;
+               break;
+       case INTEL_RAPL_PP1:
+               bit = RAPL_IDX_PP1_NRG_STAT;
+               msr = MSR_PP1_ENERGY_STATUS;
+               break;
+       default:
+               return -EINVAL;
+       }
+       /* check event supported */
+       if (!(rapl_cntr_mask & (1 << bit)))
+               return -EINVAL;
+
+       /* unsupported modes and filters */
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest  ||
+           event->attr.sample_period) /* no sampling */
+               return -EINVAL;
+
+       /* must be done before validate_group */
+       pmu = cpu_to_rapl_pmu(event->cpu);
+       event->cpu = pmu->cpu;
+       event->pmu_private = pmu;
+       event->hw.event_base = msr;
+       event->hw.config = cfg;
+       event->hw.idx = bit;
+
+       return ret;
+}
+
+static void rapl_pmu_event_read(struct perf_event *event)
+{
+       rapl_event_update(event);
+}
+
+static ssize_t rapl_get_attr_cpumask(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
+
+static struct attribute *rapl_pmu_attrs[] = {
+       &dev_attr_cpumask.attr,
+       NULL,
+};
+
+static struct attribute_group rapl_pmu_attr_group = {
+       .attrs = rapl_pmu_attrs,
+};
+
+RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
+RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
+RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
+RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
+
+RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
+
+/*
+ * we compute in 0.23 nJ increments regardless of MSR
+ */
+RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
+
+static struct attribute *rapl_events_srv_attr[] = {
+       EVENT_PTR(rapl_cores),
+       EVENT_PTR(rapl_pkg),
+       EVENT_PTR(rapl_ram),
+
+       EVENT_PTR(rapl_cores_unit),
+       EVENT_PTR(rapl_pkg_unit),
+       EVENT_PTR(rapl_ram_unit),
+
+       EVENT_PTR(rapl_cores_scale),
+       EVENT_PTR(rapl_pkg_scale),
+       EVENT_PTR(rapl_ram_scale),
+       NULL,
+};
+
+static struct attribute *rapl_events_cln_attr[] = {
+       EVENT_PTR(rapl_cores),
+       EVENT_PTR(rapl_pkg),
+       EVENT_PTR(rapl_gpu),
+
+       EVENT_PTR(rapl_cores_unit),
+       EVENT_PTR(rapl_pkg_unit),
+       EVENT_PTR(rapl_gpu_unit),
+
+       EVENT_PTR(rapl_cores_scale),
+       EVENT_PTR(rapl_pkg_scale),
+       EVENT_PTR(rapl_gpu_scale),
+       NULL,
+};
+
+static struct attribute *rapl_events_hsw_attr[] = {
+       EVENT_PTR(rapl_cores),
+       EVENT_PTR(rapl_pkg),
+       EVENT_PTR(rapl_gpu),
+       EVENT_PTR(rapl_ram),
+
+       EVENT_PTR(rapl_cores_unit),
+       EVENT_PTR(rapl_pkg_unit),
+       EVENT_PTR(rapl_gpu_unit),
+       EVENT_PTR(rapl_ram_unit),
+
+       EVENT_PTR(rapl_cores_scale),
+       EVENT_PTR(rapl_pkg_scale),
+       EVENT_PTR(rapl_gpu_scale),
+       EVENT_PTR(rapl_ram_scale),
+       NULL,
+};
+
+static struct attribute *rapl_events_knl_attr[] = {
+       EVENT_PTR(rapl_pkg),
+       EVENT_PTR(rapl_ram),
+
+       EVENT_PTR(rapl_pkg_unit),
+       EVENT_PTR(rapl_ram_unit),
+
+       EVENT_PTR(rapl_pkg_scale),
+       EVENT_PTR(rapl_ram_scale),
+       NULL,
+};
+
+static struct attribute_group rapl_pmu_events_group = {
+       .name = "events",
+       .attrs = NULL, /* patched at runtime */
+};
+
+DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
+static struct attribute *rapl_formats_attr[] = {
+       &format_attr_event.attr,
+       NULL,
+};
+
+static struct attribute_group rapl_pmu_format_group = {
+       .name = "format",
+       .attrs = rapl_formats_attr,
+};
+
+const struct attribute_group *rapl_attr_groups[] = {
+       &rapl_pmu_attr_group,
+       &rapl_pmu_format_group,
+       &rapl_pmu_events_group,
+       NULL,
+};
+
+static void rapl_cpu_exit(int cpu)
+{
+       struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+       int target;
+
+       /* Check if exiting cpu is used for collecting rapl events */
+       if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
+               return;
+
+       pmu->cpu = -1;
+       /* Find a new cpu to collect rapl events */
+       target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+
+       /* Migrate rapl events to the new target */
+       if (target < nr_cpu_ids) {
+               cpumask_set_cpu(target, &rapl_cpu_mask);
+               pmu->cpu = target;
+               perf_pmu_migrate_context(pmu->pmu, cpu, target);
+       }
+}
+
+static void rapl_cpu_init(int cpu)
+{
+       struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+       int target;
+
+       /*
+        * Check if there is an online cpu in the package which collects rapl
+        * events already.
+        */
+       target = cpumask_any_and(&rapl_cpu_mask, topology_core_cpumask(cpu));
+       if (target < nr_cpu_ids)
+               return;
+
+       cpumask_set_cpu(cpu, &rapl_cpu_mask);
+       pmu->cpu = cpu;
+}
+
+static int rapl_cpu_prepare(int cpu)
+{
+       struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+
+       if (pmu)
+               return 0;
+
+       pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
+       if (!pmu)
+               return -ENOMEM;
+
+       raw_spin_lock_init(&pmu->lock);
+       INIT_LIST_HEAD(&pmu->active_list);
+       pmu->pmu = &rapl_pmus->pmu;
+       pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
+       pmu->cpu = -1;
+       rapl_hrtimer_init(pmu);
+       rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu;
+       return 0;
+}
+
+static int rapl_cpu_notifier(struct notifier_block *self,
+                            unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               rapl_cpu_prepare(cpu);
+               break;
+
+       case CPU_DOWN_FAILED:
+       case CPU_ONLINE:
+               rapl_cpu_init(cpu);
+               break;
+
+       case CPU_DOWN_PREPARE:
+               rapl_cpu_exit(cpu);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static int rapl_check_hw_unit(bool apply_quirk)
+{
+       u64 msr_rapl_power_unit_bits;
+       int i;
+
+       /* protect rdmsrl() to handle virtualization */
+       if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
+               return -1;
+       for (i = 0; i < NR_RAPL_DOMAINS; i++)
+               rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
+
+       /*
+        * DRAM domain on HSW server and KNL has fixed energy unit which can be
+        * different than the unit from power unit MSR. See
+        * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
+        * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
+        */
+       if (apply_quirk)
+               rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
+
+       /*
+        * Calculate the timer rate:
+        * Use reference of 200W for scaling the timeout to avoid counter
+        * overflows. 200W = 200 Joules/sec
+        * Divide interval by 2 to avoid lockstep (2 * 100)
+        * if hw unit is 32, then we use 2 ms 1/200/2
+        */
+       rapl_timer_ms = 2;
+       if (rapl_hw_unit[0] < 32) {
+               rapl_timer_ms = (1000 / (2 * 100));
+               rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
+       }
+       return 0;
+}
+
+static void __init rapl_advertise(void)
+{
+       int i;
+
+       pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
+               hweight32(rapl_cntr_mask), rapl_timer_ms);
+
+       for (i = 0; i < NR_RAPL_DOMAINS; i++) {
+               if (rapl_cntr_mask & (1 << i)) {
+                       pr_info("hw unit of domain %s 2^-%d Joules\n",
+                               rapl_domain_names[i], rapl_hw_unit[i]);
+               }
+       }
+}
+
+static int __init rapl_prepare_cpus(void)
+{
+       unsigned int cpu, pkg;
+       int ret;
+
+       for_each_online_cpu(cpu) {
+               pkg = topology_logical_package_id(cpu);
+               if (rapl_pmus->pmus[pkg])
+                       continue;
+
+               ret = rapl_cpu_prepare(cpu);
+               if (ret)
+                       return ret;
+               rapl_cpu_init(cpu);
+       }
+       return 0;
+}
+
+static void __init cleanup_rapl_pmus(void)
+{
+       int i;
+
+       for (i = 0; i < rapl_pmus->maxpkg; i++)
+               kfree(rapl_pmus->pmus + i);
+       kfree(rapl_pmus);
+}
+
+static int __init init_rapl_pmus(void)
+{
+       int maxpkg = topology_max_packages();
+       size_t size;
+
+       size = sizeof(*rapl_pmus) + maxpkg * sizeof(struct rapl_pmu *);
+       rapl_pmus = kzalloc(size, GFP_KERNEL);
+       if (!rapl_pmus)
+               return -ENOMEM;
+
+       rapl_pmus->maxpkg               = maxpkg;
+       rapl_pmus->pmu.attr_groups      = rapl_attr_groups;
+       rapl_pmus->pmu.task_ctx_nr      = perf_invalid_context;
+       rapl_pmus->pmu.event_init       = rapl_pmu_event_init;
+       rapl_pmus->pmu.add              = rapl_pmu_event_add;
+       rapl_pmus->pmu.del              = rapl_pmu_event_del;
+       rapl_pmus->pmu.start            = rapl_pmu_event_start;
+       rapl_pmus->pmu.stop             = rapl_pmu_event_stop;
+       rapl_pmus->pmu.read             = rapl_pmu_event_read;
+       return 0;
+}
+
+static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
+       [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
+       [1] = {},
+};
+
+static int __init rapl_pmu_init(void)
+{
+       bool apply_quirk = false;
+       int ret;
+
+       if (!x86_match_cpu(rapl_cpu_match))
+               return -ENODEV;
+
+       switch (boot_cpu_data.x86_model) {
+       case 42: /* Sandy Bridge */
+       case 58: /* Ivy Bridge */
+               rapl_cntr_mask = RAPL_IDX_CLN;
+               rapl_pmu_events_group.attrs = rapl_events_cln_attr;
+               break;
+       case 63: /* Haswell-Server */
+               apply_quirk = true;
+               rapl_cntr_mask = RAPL_IDX_SRV;
+               rapl_pmu_events_group.attrs = rapl_events_srv_attr;
+               break;
+       case 60: /* Haswell */
+       case 69: /* Haswell-Celeron */
+       case 61: /* Broadwell */
+               rapl_cntr_mask = RAPL_IDX_HSW;
+               rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
+               break;
+       case 45: /* Sandy Bridge-EP */
+       case 62: /* IvyTown */
+               rapl_cntr_mask = RAPL_IDX_SRV;
+               rapl_pmu_events_group.attrs = rapl_events_srv_attr;
+               break;
+       case 87: /* Knights Landing */
+               apply_quirk = true;
+               rapl_cntr_mask = RAPL_IDX_KNL;
+               rapl_pmu_events_group.attrs = rapl_events_knl_attr;
+               break;
+       default:
+               return -ENODEV;
+       }
+
+       ret = rapl_check_hw_unit(apply_quirk);
+       if (ret)
+               return ret;
+
+       ret = init_rapl_pmus();
+       if (ret)
+               return ret;
+
+       cpu_notifier_register_begin();
+
+       ret = rapl_prepare_cpus();
+       if (ret)
+               goto out;
+
+       ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
+       if (ret)
+               goto out;
+
+       __perf_cpu_notifier(rapl_cpu_notifier);
+       cpu_notifier_register_done();
+       rapl_advertise();
+       return 0;
+
+out:
+       pr_warn("Initialization failed (%d), disabled\n", ret);
+       cleanup_rapl_pmus();
+       cpu_notifier_register_done();
+       return ret;
+}
+device_initcall(rapl_pmu_init);
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c

new file mode 100644 (file)

index 0000000..7012d18
--- /dev/null
+++ b/arch/x86/events/intel/uncore.c
@@ -0,0 +1,1412 @@
+#include "uncore.h"
+
+static struct intel_uncore_type *empty_uncore[] = { NULL, };
+struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
+struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
+
+static bool pcidrv_registered;
+struct pci_driver *uncore_pci_driver;
+/* pci bus to socket mapping */
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
+struct pci_extra_dev *uncore_extra_pci_dev;
+static int max_packages;
+
+/* mask of cpus that collect uncore events */
+static cpumask_t uncore_cpu_mask;
+
+/* constraint for the fixed counter */
+static struct event_constraint uncore_constraint_fixed =
+       EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
+struct event_constraint uncore_constraint_empty =
+       EVENT_CONSTRAINT(0, 0, 0);
+
+static int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+       struct pci2phy_map *map;
+       int phys_id = -1;
+
+       raw_spin_lock(&pci2phy_map_lock);
+       list_for_each_entry(map, &pci2phy_map_head, list) {
+               if (map->segment == pci_domain_nr(bus)) {
+                       phys_id = map->pbus_to_physid[bus->number];
+                       break;
+               }
+       }
+       raw_spin_unlock(&pci2phy_map_lock);
+
+       return phys_id;
+}
+
+static void uncore_free_pcibus_map(void)
+{
+       struct pci2phy_map *map, *tmp;
+
+       list_for_each_entry_safe(map, tmp, &pci2phy_map_head, list) {
+               list_del(&map->list);
+               kfree(map);
+       }
+}
+
+struct pci2phy_map *__find_pci2phy_map(int segment)
+{
+       struct pci2phy_map *map, *alloc = NULL;
+       int i;
+
+       lockdep_assert_held(&pci2phy_map_lock);
+
+lookup:
+       list_for_each_entry(map, &pci2phy_map_head, list) {
+               if (map->segment == segment)
+                       goto end;
+       }
+
+       if (!alloc) {
+               raw_spin_unlock(&pci2phy_map_lock);
+               alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
+               raw_spin_lock(&pci2phy_map_lock);
+
+               if (!alloc)
+                       return NULL;
+
+               goto lookup;
+       }
+
+       map = alloc;
+       alloc = NULL;
+       map->segment = segment;
+       for (i = 0; i < 256; i++)
+               map->pbus_to_physid[i] = -1;
+       list_add_tail(&map->list, &pci2phy_map_head);
+
+end:
+       kfree(alloc);
+       return map;
+}
+
+ssize_t uncore_event_show(struct kobject *kobj,
+                         struct kobj_attribute *attr, char *buf)
+{
+       struct uncore_event_desc *event =
+               container_of(attr, struct uncore_event_desc, attr);
+       return sprintf(buf, "%s", event->config);
+}
+
+struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
+{
+       return pmu->boxes[topology_logical_package_id(cpu)];
+}
+
+u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+       u64 count;
+
+       rdmsrl(event->hw.event_base, count);
+
+       return count;
+}
+
+/*
+ * generic get constraint function for shared match/mask registers.
+ */
+struct event_constraint *
+uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct intel_uncore_extra_reg *er;
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+       unsigned long flags;
+       bool ok = false;
+
+       /*
+        * reg->alloc can be set due to existing state, so for fake box we
+        * need to ignore this, otherwise we might fail to allocate proper
+        * fake state for this extra reg constraint.
+        */
+       if (reg1->idx == EXTRA_REG_NONE ||
+           (!uncore_box_is_fake(box) && reg1->alloc))
+               return NULL;
+
+       er = &box->shared_regs[reg1->idx];
+       raw_spin_lock_irqsave(&er->lock, flags);
+       if (!atomic_read(&er->ref) ||
+           (er->config1 == reg1->config && er->config2 == reg2->config)) {
+               atomic_inc(&er->ref);
+               er->config1 = reg1->config;
+               er->config2 = reg2->config;
+               ok = true;
+       }
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+
+       if (ok) {
+               if (!uncore_box_is_fake(box))
+                       reg1->alloc = 1;
+               return NULL;
+       }
+
+       return &uncore_constraint_empty;
+}
+
+void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct intel_uncore_extra_reg *er;
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+
+       /*
+        * Only put constraint if extra reg was actually allocated. Also
+        * takes care of event which do not use an extra shared reg.
+        *
+        * Also, if this is a fake box we shouldn't touch any event state
+        * (reg->alloc) and we don't care about leaving inconsistent box
+        * state either since it will be thrown out.
+        */
+       if (uncore_box_is_fake(box) || !reg1->alloc)
+               return;
+
+       er = &box->shared_regs[reg1->idx];
+       atomic_dec(&er->ref);
+       reg1->alloc = 0;
+}
+
+u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+       struct intel_uncore_extra_reg *er;
+       unsigned long flags;
+       u64 config;
+
+       er = &box->shared_regs[idx];
+
+       raw_spin_lock_irqsave(&er->lock, flags);
+       config = er->config;
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+
+       return config;
+}
+
+static void uncore_assign_hw_event(struct intel_uncore_box *box,
+                                  struct perf_event *event, int idx)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       hwc->idx = idx;
+       hwc->last_tag = ++box->tags[idx];
+
+       if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
+               hwc->event_base = uncore_fixed_ctr(box);
+               hwc->config_base = uncore_fixed_ctl(box);
+               return;
+       }
+
+       hwc->config_base = uncore_event_ctl(box, hwc->idx);
+       hwc->event_base  = uncore_perf_ctr(box, hwc->idx);
+}
+
+void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
+{
+       u64 prev_count, new_count, delta;
+       int shift;
+
+       if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
+               shift = 64 - uncore_fixed_ctr_bits(box);
+       else
+               shift = 64 - uncore_perf_ctr_bits(box);
+
+       /* the hrtimer might modify the previous event value */
+again:
+       prev_count = local64_read(&event->hw.prev_count);
+       new_count = uncore_read_counter(box, event);
+       if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
+               goto again;
+
+       delta = (new_count << shift) - (prev_count << shift);
+       delta >>= shift;
+
+       local64_add(delta, &event->count);
+}
+
+/*
+ * The overflow interrupt is unavailable for SandyBridge-EP, is broken
+ * for SandyBridge. So we use hrtimer to periodically poll the counter
+ * to avoid overflow.
+ */
+static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
+{
+       struct intel_uncore_box *box;
+       struct perf_event *event;
+       unsigned long flags;
+       int bit;
+
+       box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
+       if (!box->n_active || box->cpu != smp_processor_id())
+               return HRTIMER_NORESTART;
+       /*
+        * disable local interrupt to prevent uncore_pmu_event_start/stop
+        * to interrupt the update process
+        */
+       local_irq_save(flags);
+
+       /*
+        * handle boxes with an active event list as opposed to active
+        * counters
+        */
+       list_for_each_entry(event, &box->active_list, active_entry) {
+               uncore_perf_event_update(box, event);
+       }
+
+       for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
+               uncore_perf_event_update(box, box->events[bit]);
+
+       local_irq_restore(flags);
+
+       hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));
+       return HRTIMER_RESTART;
+}
+
+void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
+{
+       hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration),
+                     HRTIMER_MODE_REL_PINNED);
+}
+
+void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
+{
+       hrtimer_cancel(&box->hrtimer);
+}
+
+static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
+{
+       hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       box->hrtimer.function = uncore_pmu_hrtimer;
+}
+
+static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
+                                                int node)
+{
+       int i, size, numshared = type->num_shared_regs ;
+       struct intel_uncore_box *box;
+
+       size = sizeof(*box) + numshared * sizeof(struct intel_uncore_extra_reg);
+
+       box = kzalloc_node(size, GFP_KERNEL, node);
+       if (!box)
+               return NULL;
+
+       for (i = 0; i < numshared; i++)
+               raw_spin_lock_init(&box->shared_regs[i].lock);
+
+       uncore_pmu_init_hrtimer(box);
+       box->cpu = -1;
+       box->pci_phys_id = -1;
+       box->pkgid = -1;
+
+       /* set default hrtimer timeout */
+       box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL;
+
+       INIT_LIST_HEAD(&box->active_list);
+
+       return box;
+}
+
+/*
+ * Using uncore_pmu_event_init pmu event_init callback
+ * as a detection point for uncore events.
+ */
+static int uncore_pmu_event_init(struct perf_event *event);
+
+static bool is_uncore_event(struct perf_event *event)
+{
+       return event->pmu->event_init == uncore_pmu_event_init;
+}
+
+static int
+uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader,
+                     bool dogrp)
+{
+       struct perf_event *event;
+       int n, max_count;
+
+       max_count = box->pmu->type->num_counters;
+       if (box->pmu->type->fixed_ctl)
+               max_count++;
+
+       if (box->n_events >= max_count)
+               return -EINVAL;
+
+       n = box->n_events;
+
+       if (is_uncore_event(leader)) {
+               box->event_list[n] = leader;
+               n++;
+       }
+
+       if (!dogrp)
+               return n;
+
+       list_for_each_entry(event, &leader->sibling_list, group_entry) {
+               if (!is_uncore_event(event) ||
+                   event->state <= PERF_EVENT_STATE_OFF)
+                       continue;
+
+               if (n >= max_count)
+                       return -EINVAL;
+
+               box->event_list[n] = event;
+               n++;
+       }
+       return n;
+}
+
+static struct event_constraint *
+uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct intel_uncore_type *type = box->pmu->type;
+       struct event_constraint *c;
+
+       if (type->ops->get_constraint) {
+               c = type->ops->get_constraint(box, event);
+               if (c)
+                       return c;
+       }
+
+       if (event->attr.config == UNCORE_FIXED_EVENT)
+               return &uncore_constraint_fixed;
+
+       if (type->constraints) {
+               for_each_event_constraint(c, type->constraints) {
+                       if ((event->hw.config & c->cmask) == c->code)
+                               return c;
+               }
+       }
+
+       return &type->unconstrainted;
+}
+
+static void uncore_put_event_constraint(struct intel_uncore_box *box,
+                                       struct perf_event *event)
+{
+       if (box->pmu->type->ops->put_constraint)
+               box->pmu->type->ops->put_constraint(box, event);
+}
+
+static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
+{
+       unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
+       struct event_constraint *c;
+       int i, wmin, wmax, ret = 0;
+       struct hw_perf_event *hwc;
+
+       bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
+
+       for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
+               c = uncore_get_event_constraint(box, box->event_list[i]);
+               box->event_constraint[i] = c;
+               wmin = min(wmin, c->weight);
+               wmax = max(wmax, c->weight);
+       }
+
+       /* fastpath, try to reuse previous register */
+       for (i = 0; i < n; i++) {
+               hwc = &box->event_list[i]->hw;
+               c = box->event_constraint[i];
+
+               /* never assigned */
+               if (hwc->idx == -1)
+                       break;
+
+               /* constraint still honored */
+               if (!test_bit(hwc->idx, c->idxmsk))
+                       break;
+
+               /* not already used */
+               if (test_bit(hwc->idx, used_mask))
+                       break;
+
+               __set_bit(hwc->idx, used_mask);
+               if (assign)
+                       assign[i] = hwc->idx;
+       }
+       /* slow path */
+       if (i != n)
+               ret = perf_assign_events(box->event_constraint, n,
+                                        wmin, wmax, n, assign);
+
+       if (!assign || ret) {
+               for (i = 0; i < n; i++)
+                       uncore_put_event_constraint(box, box->event_list[i]);
+       }
+       return ret ? -EINVAL : 0;
+}
+
+static void uncore_pmu_event_start(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       int idx = event->hw.idx;
+
+       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+               return;
+
+       if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
+               return;
+
+       event->hw.state = 0;
+       box->events[idx] = event;
+       box->n_active++;
+       __set_bit(idx, box->active_mask);
+
+       local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
+       uncore_enable_event(box, event);
+
+       if (box->n_active == 1) {
+               uncore_enable_box(box);
+               uncore_pmu_start_hrtimer(box);
+       }
+}
+
+static void uncore_pmu_event_stop(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
+               uncore_disable_event(box, event);
+               box->n_active--;
+               box->events[hwc->idx] = NULL;
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+
+               if (box->n_active == 0) {
+                       uncore_disable_box(box);
+                       uncore_pmu_cancel_hrtimer(box);
+               }
+       }
+
+       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               /*
+                * Drain the remaining delta count out of a event
+                * that we are disabling:
+                */
+               uncore_perf_event_update(box, event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+}
+
+static int uncore_pmu_event_add(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       struct hw_perf_event *hwc = &event->hw;
+       int assign[UNCORE_PMC_IDX_MAX];
+       int i, n, ret;
+
+       if (!box)
+               return -ENODEV;
+
+       ret = n = uncore_collect_events(box, event, false);
+       if (ret < 0)
+               return ret;
+
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+       if (!(flags & PERF_EF_START))
+               hwc->state |= PERF_HES_ARCH;
+
+       ret = uncore_assign_events(box, assign, n);
+       if (ret)
+               return ret;
+
+       /* save events moving to new counters */
+       for (i = 0; i < box->n_events; i++) {
+               event = box->event_list[i];
+               hwc = &event->hw;
+
+               if (hwc->idx == assign[i] &&
+                       hwc->last_tag == box->tags[assign[i]])
+                       continue;
+               /*
+                * Ensure we don't accidentally enable a stopped
+                * counter simply because we rescheduled.
+                */
+               if (hwc->state & PERF_HES_STOPPED)
+                       hwc->state |= PERF_HES_ARCH;
+
+               uncore_pmu_event_stop(event, PERF_EF_UPDATE);
+       }
+
+       /* reprogram moved events into new counters */
+       for (i = 0; i < n; i++) {
+               event = box->event_list[i];
+               hwc = &event->hw;
+
+               if (hwc->idx != assign[i] ||
+                       hwc->last_tag != box->tags[assign[i]])
+                       uncore_assign_hw_event(box, event, assign[i]);
+               else if (i < box->n_events)
+                       continue;
+
+               if (hwc->state & PERF_HES_ARCH)
+                       continue;
+
+               uncore_pmu_event_start(event, 0);
+       }
+       box->n_events = n;
+
+       return 0;
+}
+
+static void uncore_pmu_event_del(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       int i;
+
+       uncore_pmu_event_stop(event, PERF_EF_UPDATE);
+
+       for (i = 0; i < box->n_events; i++) {
+               if (event == box->event_list[i]) {
+                       uncore_put_event_constraint(box, event);
+
+                       for (++i; i < box->n_events; i++)
+                               box->event_list[i - 1] = box->event_list[i];
+
+                       --box->n_events;
+                       break;
+               }
+       }
+
+       event->hw.idx = -1;
+       event->hw.last_tag = ~0ULL;
+}
+
+void uncore_pmu_event_read(struct perf_event *event)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       uncore_perf_event_update(box, event);
+}
+
+/*
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
+ */
+static int uncore_validate_group(struct intel_uncore_pmu *pmu,
+                               struct perf_event *event)
+{
+       struct perf_event *leader = event->group_leader;
+       struct intel_uncore_box *fake_box;
+       int ret = -EINVAL, n;
+
+       fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE);
+       if (!fake_box)
+               return -ENOMEM;
+
+       fake_box->pmu = pmu;
+       /*
+        * the event is not yet connected with its
+        * siblings therefore we must first collect
+        * existing siblings, then add the new event
+        * before we can simulate the scheduling
+        */
+       n = uncore_collect_events(fake_box, leader, true);
+       if (n < 0)
+               goto out;
+
+       fake_box->n_events = n;
+       n = uncore_collect_events(fake_box, event, false);
+       if (n < 0)
+               goto out;
+
+       fake_box->n_events = n;
+
+       ret = uncore_assign_events(fake_box, NULL, n);
+out:
+       kfree(fake_box);
+       return ret;
+}
+
+static int uncore_pmu_event_init(struct perf_event *event)
+{
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       struct hw_perf_event *hwc = &event->hw;
+       int ret;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       pmu = uncore_event_to_pmu(event);
+       /* no device found for this pmu */
+       if (pmu->func_id < 0)
+               return -ENOENT;
+
+       /*
+        * Uncore PMU does measure at all privilege level all the time.
+        * So it doesn't make sense to specify any exclude bits.
+        */
+       if (event->attr.exclude_user || event->attr.exclude_kernel ||
+                       event->attr.exclude_hv || event->attr.exclude_idle)
+               return -EINVAL;
+
+       /* Sampling not supported yet */
+       if (hwc->sample_period)
+               return -EINVAL;
+
+       /*
+        * Place all uncore events for a particular physical package
+        * onto a single cpu
+        */
+       if (event->cpu < 0)
+               return -EINVAL;
+       box = uncore_pmu_to_box(pmu, event->cpu);
+       if (!box || box->cpu < 0)
+               return -EINVAL;
+       event->cpu = box->cpu;
+       event->pmu_private = box;
+
+       event->hw.idx = -1;
+       event->hw.last_tag = ~0ULL;
+       event->hw.extra_reg.idx = EXTRA_REG_NONE;
+       event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
+       if (event->attr.config == UNCORE_FIXED_EVENT) {
+               /* no fixed counter */
+               if (!pmu->type->fixed_ctl)
+                       return -EINVAL;
+               /*
+                * if there is only one fixed counter, only the first pmu
+                * can access the fixed counter
+                */
+               if (pmu->type->single_fixed && pmu->pmu_idx > 0)
+                       return -EINVAL;
+
+               /* fixed counters have event field hardcoded to zero */
+               hwc->config = 0ULL;
+       } else {
+               hwc->config = event->attr.config & pmu->type->event_mask;
+               if (pmu->type->ops->hw_config) {
+                       ret = pmu->type->ops->hw_config(box, event);
+                       if (ret)
+                               return ret;
+               }
+       }
+
+       if (event->group_leader != event)
+               ret = uncore_validate_group(pmu, event);
+       else
+               ret = 0;
+
+       return ret;
+}
+
+static ssize_t uncore_get_attr_cpumask(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
+
+static struct attribute *uncore_pmu_attrs[] = {
+       &dev_attr_cpumask.attr,
+       NULL,
+};
+
+static struct attribute_group uncore_pmu_attr_group = {
+       .attrs = uncore_pmu_attrs,
+};
+
+static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
+{
+       int ret;
+
+       if (!pmu->type->pmu) {
+               pmu->pmu = (struct pmu) {
+                       .attr_groups    = pmu->type->attr_groups,
+                       .task_ctx_nr    = perf_invalid_context,
+                       .event_init     = uncore_pmu_event_init,
+                       .add            = uncore_pmu_event_add,
+                       .del            = uncore_pmu_event_del,
+                       .start          = uncore_pmu_event_start,
+                       .stop           = uncore_pmu_event_stop,
+                       .read           = uncore_pmu_event_read,
+               };
+       } else {
+               pmu->pmu = *pmu->type->pmu;
+               pmu->pmu.attr_groups = pmu->type->attr_groups;
+       }
+
+       if (pmu->type->num_boxes == 1) {
+               if (strlen(pmu->type->name) > 0)
+                       sprintf(pmu->name, "uncore_%s", pmu->type->name);
+               else
+                       sprintf(pmu->name, "uncore");
+       } else {
+               sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
+                       pmu->pmu_idx);
+       }
+
+       ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
+       if (!ret)
+               pmu->registered = true;
+       return ret;
+}
+
+static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu)
+{
+       if (!pmu->registered)
+               return;
+       perf_pmu_unregister(&pmu->pmu);
+       pmu->registered = false;
+}
+
+static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu)
+{
+       struct intel_uncore_pmu *pmu = type->pmus;
+       struct intel_uncore_box *box;
+       int i, pkg;
+
+       if (pmu) {
+               pkg = topology_physical_package_id(cpu);
+               for (i = 0; i < type->num_boxes; i++, pmu++) {
+                       box = pmu->boxes[pkg];
+                       if (box)
+                               uncore_box_exit(box);
+               }
+       }
+}
+
+static void __init uncore_exit_boxes(void *dummy)
+{
+       struct intel_uncore_type **types;
+
+       for (types = uncore_msr_uncores; *types; types++)
+               __uncore_exit_boxes(*types++, smp_processor_id());
+}
+
+static void uncore_free_boxes(struct intel_uncore_pmu *pmu)
+{
+       int pkg;
+
+       for (pkg = 0; pkg < max_packages; pkg++)
+               kfree(pmu->boxes[pkg]);
+       kfree(pmu->boxes);
+}
+
+static void __init uncore_type_exit(struct intel_uncore_type *type)
+{
+       struct intel_uncore_pmu *pmu = type->pmus;
+       int i;
+
+       if (pmu) {
+               for (i = 0; i < type->num_boxes; i++, pmu++) {
+                       uncore_pmu_unregister(pmu);
+                       uncore_free_boxes(pmu);
+               }
+               kfree(type->pmus);
+               type->pmus = NULL;
+       }
+       kfree(type->events_group);
+       type->events_group = NULL;
+}
+
+static void __init uncore_types_exit(struct intel_uncore_type **types)
+{
+       for (; *types; types++)
+               uncore_type_exit(*types);
+}
+
+static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
+{
+       struct intel_uncore_pmu *pmus;
+       struct attribute_group *attr_group;
+       struct attribute **attrs;
+       size_t size;
+       int i, j;
+
+       pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
+       if (!pmus)
+               return -ENOMEM;
+
+       size = max_packages * sizeof(struct intel_uncore_box *);
+
+       for (i = 0; i < type->num_boxes; i++) {
+               pmus[i].func_id = setid ? i : -1;
+               pmus[i].pmu_idx = i;
+               pmus[i].type    = type;
+               pmus[i].boxes   = kzalloc(size, GFP_KERNEL);
+               if (!pmus[i].boxes)
+                       return -ENOMEM;
+       }
+
+       type->pmus = pmus;
+       type->unconstrainted = (struct event_constraint)
+               __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
+                               0, type->num_counters, 0, 0);
+
+       if (type->event_descs) {
+               for (i = 0; type->event_descs[i].attr.attr.name; i++);
+
+               attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
+                                       sizeof(*attr_group), GFP_KERNEL);
+               if (!attr_group)
+                       return -ENOMEM;
+
+               attrs = (struct attribute **)(attr_group + 1);
+               attr_group->name = "events";
+               attr_group->attrs = attrs;
+
+               for (j = 0; j < i; j++)
+                       attrs[j] = &type->event_descs[j].attr.attr;
+
+               type->events_group = attr_group;
+       }
+
+       type->pmu_group = &uncore_pmu_attr_group;
+       return 0;
+}
+
+static int __init
+uncore_types_init(struct intel_uncore_type **types, bool setid)
+{
+       int ret;
+
+       for (; *types; types++) {
+               ret = uncore_type_init(*types, setid);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+/*
+ * add a pci uncore device
+ */
+static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+       struct intel_uncore_type *type;
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       int phys_id, pkg, ret;
+
+       phys_id = uncore_pcibus_to_physid(pdev->bus);
+       if (phys_id < 0)
+               return -ENODEV;
+
+       pkg = topology_phys_to_logical_pkg(phys_id);
+       if (WARN_ON_ONCE(pkg < 0))
+               return -EINVAL;
+
+       if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
+               int idx = UNCORE_PCI_DEV_IDX(id->driver_data);
+
+               uncore_extra_pci_dev[pkg].dev[idx] = pdev;
+               pci_set_drvdata(pdev, NULL);
+               return 0;
+       }
+
+       type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
+       /*
+        * for performance monitoring unit with multiple boxes,
+        * each box has a different function id.
+        */
+       pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
+       /* Knights Landing uses a common PCI device ID for multiple instances of
+        * an uncore PMU device type. There is only one entry per device type in
+        * the knl_uncore_pci_ids table inspite of multiple devices present for
+        * some device types. Hence PCI device idx would be 0 for all devices.
+        * So increment pmu pointer to point to an unused array element.
+        */
+       if (boot_cpu_data.x86_model == 87) {
+               while (pmu->func_id >= 0)
+                       pmu++;
+       }
+
+       if (WARN_ON_ONCE(pmu->boxes[pkg] != NULL))
+               return -EINVAL;
+
+       box = uncore_alloc_box(type, NUMA_NO_NODE);
+       if (!box)
+               return -ENOMEM;
+
+       if (pmu->func_id < 0)
+               pmu->func_id = pdev->devfn;
+       else
+               WARN_ON_ONCE(pmu->func_id != pdev->devfn);
+
+       atomic_inc(&box->refcnt);
+       box->pci_phys_id = phys_id;
+       box->pkgid = pkg;
+       box->pci_dev = pdev;
+       box->pmu = pmu;
+       uncore_box_init(box);
+       pci_set_drvdata(pdev, box);
+
+       pmu->boxes[pkg] = box;
+       if (atomic_inc_return(&pmu->activeboxes) > 1)
+               return 0;
+
+       /* First active box registers the pmu */
+       ret = uncore_pmu_register(pmu);
+       if (ret) {
+               pci_set_drvdata(pdev, NULL);
+               pmu->boxes[pkg] = NULL;
+               uncore_box_exit(box);
+               kfree(box);
+       }
+       return ret;
+}
+
+static void uncore_pci_remove(struct pci_dev *pdev)
+{
+       struct intel_uncore_box *box = pci_get_drvdata(pdev);
+       struct intel_uncore_pmu *pmu;
+       int i, phys_id, pkg;
+
+       phys_id = uncore_pcibus_to_physid(pdev->bus);
+       pkg = topology_phys_to_logical_pkg(phys_id);
+
+       box = pci_get_drvdata(pdev);
+       if (!box) {
+               for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
+                       if (uncore_extra_pci_dev[pkg].dev[i] == pdev) {
+                               uncore_extra_pci_dev[pkg].dev[i] = NULL;
+                               break;
+                       }
+               }
+               WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX);
+               return;
+       }
+
+       pmu = box->pmu;
+       if (WARN_ON_ONCE(phys_id != box->pci_phys_id))
+               return;
+
+       pci_set_drvdata(pdev, NULL);
+       pmu->boxes[pkg] = NULL;
+       if (atomic_dec_return(&pmu->activeboxes) == 0)
+               uncore_pmu_unregister(pmu);
+       uncore_box_exit(box);
+       kfree(box);
+}
+
+static int __init uncore_pci_init(void)
+{
+       size_t size;
+       int ret;
+
+       switch (boot_cpu_data.x86_model) {
+       case 45: /* Sandy Bridge-EP */
+               ret = snbep_uncore_pci_init();
+               break;
+       case 62: /* Ivy Bridge-EP */
+               ret = ivbep_uncore_pci_init();
+               break;
+       case 63: /* Haswell-EP */
+               ret = hswep_uncore_pci_init();
+               break;
+       case 79: /* BDX-EP */
+       case 86: /* BDX-DE */
+               ret = bdx_uncore_pci_init();
+               break;
+       case 42: /* Sandy Bridge */
+               ret = snb_uncore_pci_init();
+               break;
+       case 58: /* Ivy Bridge */
+               ret = ivb_uncore_pci_init();
+               break;
+       case 60: /* Haswell */
+       case 69: /* Haswell Celeron */
+               ret = hsw_uncore_pci_init();
+               break;
+       case 61: /* Broadwell */
+               ret = bdw_uncore_pci_init();
+               break;
+       case 87: /* Knights Landing */
+               ret = knl_uncore_pci_init();
+               break;
+       case 94: /* SkyLake */
+               ret = skl_uncore_pci_init();
+               break;
+       default:
+               return -ENODEV;
+       }
+
+       if (ret)
+               return ret;
+
+       size = max_packages * sizeof(struct pci_extra_dev);
+       uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL);
+       if (!uncore_extra_pci_dev) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       ret = uncore_types_init(uncore_pci_uncores, false);
+       if (ret)
+               goto errtype;
+
+       uncore_pci_driver->probe = uncore_pci_probe;
+       uncore_pci_driver->remove = uncore_pci_remove;
+
+       ret = pci_register_driver(uncore_pci_driver);
+       if (ret)
+               goto errtype;
+
+       pcidrv_registered = true;
+       return 0;
+
+errtype:
+       uncore_types_exit(uncore_pci_uncores);
+       kfree(uncore_extra_pci_dev);
+       uncore_extra_pci_dev = NULL;
+       uncore_free_pcibus_map();
+err:
+       uncore_pci_uncores = empty_uncore;
+       return ret;
+}
+
+static void __init uncore_pci_exit(void)
+{
+       if (pcidrv_registered) {
+               pcidrv_registered = false;
+               pci_unregister_driver(uncore_pci_driver);
+               uncore_types_exit(uncore_pci_uncores);
+               kfree(uncore_extra_pci_dev);
+               uncore_free_pcibus_map();
+       }
+}
+
+static void uncore_cpu_dying(int cpu)
+{
+       struct intel_uncore_type *type, **types = uncore_msr_uncores;
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       int i, pkg;
+
+       pkg = topology_logical_package_id(cpu);
+       for (; *types; types++) {
+               type = *types;
+               pmu = type->pmus;
+               for (i = 0; i < type->num_boxes; i++, pmu++) {
+                       box = pmu->boxes[pkg];
+                       if (box && atomic_dec_return(&box->refcnt) == 0)
+                               uncore_box_exit(box);
+               }
+       }
+}
+
+static void uncore_cpu_starting(int cpu, bool init)
+{
+       struct intel_uncore_type *type, **types = uncore_msr_uncores;
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       int i, pkg, ncpus = 1;
+
+       if (init) {
+               /*
+                * On init we get the number of online cpus in the package
+                * and set refcount for all of them.
+                */
+               ncpus = cpumask_weight(topology_core_cpumask(cpu));
+       }
+
+       pkg = topology_logical_package_id(cpu);
+       for (; *types; types++) {
+               type = *types;
+               pmu = type->pmus;
+               for (i = 0; i < type->num_boxes; i++, pmu++) {
+                       box = pmu->boxes[pkg];
+                       if (!box)
+                               continue;
+                       /* The first cpu on a package activates the box */
+                       if (atomic_add_return(ncpus, &box->refcnt) == ncpus)
+                               uncore_box_init(box);
+               }
+       }
+}
+
+static int uncore_cpu_prepare(int cpu)
+{
+       struct intel_uncore_type *type, **types = uncore_msr_uncores;
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       int i, pkg;
+
+       pkg = topology_logical_package_id(cpu);
+       for (; *types; types++) {
+               type = *types;
+               pmu = type->pmus;
+               for (i = 0; i < type->num_boxes; i++, pmu++) {
+                       if (pmu->boxes[pkg])
+                               continue;
+                       /* First cpu of a package allocates the box */
+                       box = uncore_alloc_box(type, cpu_to_node(cpu));
+                       if (!box)
+                               return -ENOMEM;
+                       box->pmu = pmu;
+                       box->pkgid = pkg;
+                       pmu->boxes[pkg] = box;
+               }
+       }
+       return 0;
+}
+
+static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu,
+                                  int new_cpu)
+{
+       struct intel_uncore_pmu *pmu = type->pmus;
+       struct intel_uncore_box *box;
+       int i, pkg;
+
+       pkg = topology_logical_package_id(old_cpu < 0 ? new_cpu : old_cpu);
+       for (i = 0; i < type->num_boxes; i++, pmu++) {
+               box = pmu->boxes[pkg];
+               if (!box)
+                       continue;
+
+               if (old_cpu < 0) {
+                       WARN_ON_ONCE(box->cpu != -1);
+                       box->cpu = new_cpu;
+                       continue;
+               }
+
+               WARN_ON_ONCE(box->cpu != old_cpu);
+               box->cpu = -1;
+               if (new_cpu < 0)
+                       continue;
+
+               uncore_pmu_cancel_hrtimer(box);
+               perf_pmu_migrate_context(&pmu->pmu, old_cpu, new_cpu);
+               box->cpu = new_cpu;
+       }
+}
+
+static void uncore_change_context(struct intel_uncore_type **uncores,
+                                 int old_cpu, int new_cpu)
+{
+       for (; *uncores; uncores++)
+               uncore_change_type_ctx(*uncores, old_cpu, new_cpu);
+}
+
+static void uncore_event_exit_cpu(int cpu)
+{
+       int target;
+
+       /* Check if exiting cpu is used for collecting uncore events */
+       if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
+               return;
+
+       /* Find a new cpu to collect uncore events */
+       target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+
+       /* Migrate uncore events to the new target */
+       if (target < nr_cpu_ids)
+               cpumask_set_cpu(target, &uncore_cpu_mask);
+       else
+               target = -1;
+
+       uncore_change_context(uncore_msr_uncores, cpu, target);
+       uncore_change_context(uncore_pci_uncores, cpu, target);
+}
+
+static void uncore_event_init_cpu(int cpu)
+{
+       int target;
+
+       /*
+        * Check if there is an online cpu in the package
+        * which collects uncore events already.
+        */
+       target = cpumask_any_and(&uncore_cpu_mask, topology_core_cpumask(cpu));
+       if (target < nr_cpu_ids)
+               return;
+
+       cpumask_set_cpu(cpu, &uncore_cpu_mask);
+
+       uncore_change_context(uncore_msr_uncores, -1, cpu);
+       uncore_change_context(uncore_pci_uncores, -1, cpu);
+}
+
+static int uncore_cpu_notifier(struct notifier_block *self,
+                              unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               return notifier_from_errno(uncore_cpu_prepare(cpu));
+
+       case CPU_STARTING:
+               uncore_cpu_starting(cpu, false);
+       case CPU_DOWN_FAILED:
+               uncore_event_init_cpu(cpu);
+               break;
+
+       case CPU_UP_CANCELED:
+       case CPU_DYING:
+               uncore_cpu_dying(cpu);
+               break;
+
+       case CPU_DOWN_PREPARE:
+               uncore_event_exit_cpu(cpu);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block uncore_cpu_nb = {
+       .notifier_call  = uncore_cpu_notifier,
+       /*
+        * to migrate uncore events, our notifier should be executed
+        * before perf core's notifier.
+        */
+       .priority       = CPU_PRI_PERF + 1,
+};
+
+static int __init type_pmu_register(struct intel_uncore_type *type)
+{
+       int i, ret;
+
+       for (i = 0; i < type->num_boxes; i++) {
+               ret = uncore_pmu_register(&type->pmus[i]);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+static int __init uncore_msr_pmus_register(void)
+{
+       struct intel_uncore_type **types = uncore_msr_uncores;
+       int ret;
+
+       for (; *types; types++) {
+               ret = type_pmu_register(*types);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+static int __init uncore_cpu_init(void)
+{
+       int ret;
+
+       switch (boot_cpu_data.x86_model) {
+       case 26: /* Nehalem */
+       case 30:
+       case 37: /* Westmere */
+       case 44:
+               nhm_uncore_cpu_init();
+               break;
+       case 42: /* Sandy Bridge */
+       case 58: /* Ivy Bridge */
+       case 60: /* Haswell */
+       case 69: /* Haswell */
+       case 70: /* Haswell */
+       case 61: /* Broadwell */
+       case 71: /* Broadwell */
+               snb_uncore_cpu_init();
+               break;
+       case 45: /* Sandy Bridge-EP */
+               snbep_uncore_cpu_init();
+               break;
+       case 46: /* Nehalem-EX */
+       case 47: /* Westmere-EX aka. Xeon E7 */
+               nhmex_uncore_cpu_init();
+               break;
+       case 62: /* Ivy Bridge-EP */
+               ivbep_uncore_cpu_init();
+               break;
+       case 63: /* Haswell-EP */
+               hswep_uncore_cpu_init();
+               break;
+       case 79: /* BDX-EP */
+       case 86: /* BDX-DE */
+               bdx_uncore_cpu_init();
+               break;
+       case 87: /* Knights Landing */
+               knl_uncore_cpu_init();
+               break;
+       default:
+               return -ENODEV;
+       }
+
+       ret = uncore_types_init(uncore_msr_uncores, true);
+       if (ret)
+               goto err;
+
+       ret = uncore_msr_pmus_register();
+       if (ret)
+               goto err;
+       return 0;
+err:
+       uncore_types_exit(uncore_msr_uncores);
+       uncore_msr_uncores = empty_uncore;
+       return ret;
+}
+
+static void __init uncore_cpu_setup(void *dummy)
+{
+       uncore_cpu_starting(smp_processor_id(), true);
+}
+
+/* Lazy to avoid allocation of a few bytes for the normal case */
+static __initdata DECLARE_BITMAP(packages, MAX_LOCAL_APIC);
+
+static int __init uncore_cpumask_init(bool msr)
+{
+       unsigned int cpu;
+
+       for_each_online_cpu(cpu) {
+               unsigned int pkg = topology_logical_package_id(cpu);
+               int ret;
+
+               if (test_and_set_bit(pkg, packages))
+                       continue;
+               /*
+                * The first online cpu of each package allocates and takes
+                * the refcounts for all other online cpus in that package.
+                * If msrs are not enabled no allocation is required.
+                */
+               if (msr) {
+                       ret = uncore_cpu_prepare(cpu);
+                       if (ret)
+                               return ret;
+               }
+               uncore_event_init_cpu(cpu);
+               smp_call_function_single(cpu, uncore_cpu_setup, NULL, 1);
+       }
+       __register_cpu_notifier(&uncore_cpu_nb);
+       return 0;
+}
+
+static int __init intel_uncore_init(void)
+{
+       int pret, cret, ret;
+
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+               return -ENODEV;
+
+       if (cpu_has_hypervisor)
+               return -ENODEV;
+
+       max_packages = topology_max_packages();
+
+       pret = uncore_pci_init();
+       cret = uncore_cpu_init();
+
+       if (cret && pret)
+               return -ENODEV;
+
+       cpu_notifier_register_begin();
+       ret = uncore_cpumask_init(!cret);
+       if (ret)
+               goto err;
+       cpu_notifier_register_done();
+       return 0;
+
+err:
+       /* Undo box->init_box() */
+       on_each_cpu_mask(&uncore_cpu_mask, uncore_exit_boxes, NULL, 1);
+       uncore_types_exit(uncore_msr_uncores);
+       uncore_pci_exit();
+       cpu_notifier_register_done();
+       return ret;
+}
+device_initcall(intel_uncore_init);
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h

new file mode 100644 (file)

index 0000000..79766b9
--- /dev/null
+++ b/arch/x86/events/intel/uncore.h
@@ -0,0 +1,378 @@
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <asm/apicdef.h>
+
+#include <linux/perf_event.h>
+#include "../perf_event.h"
+
+#define UNCORE_PMU_NAME_LEN            32
+#define UNCORE_PMU_HRTIMER_INTERVAL    (60LL * NSEC_PER_SEC)
+#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)
+
+#define UNCORE_FIXED_EVENT             0xff
+#define UNCORE_PMC_IDX_MAX_GENERIC     8
+#define UNCORE_PMC_IDX_FIXED           UNCORE_PMC_IDX_MAX_GENERIC
+#define UNCORE_PMC_IDX_MAX             (UNCORE_PMC_IDX_FIXED + 1)
+
+#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx)
+#define UNCORE_PCI_DEV_TYPE(data)      ((data >> 8) & 0xff)
+#define UNCORE_PCI_DEV_IDX(data)       (data & 0xff)
+#define UNCORE_EXTRA_PCI_DEV           0xff
+#define UNCORE_EXTRA_PCI_DEV_MAX       3
+
+#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
+
+struct pci_extra_dev {
+       struct pci_dev *dev[UNCORE_EXTRA_PCI_DEV_MAX];
+};
+
+struct intel_uncore_ops;
+struct intel_uncore_pmu;
+struct intel_uncore_box;
+struct uncore_event_desc;
+
+struct intel_uncore_type {
+       const char *name;
+       int num_counters;
+       int num_boxes;
+       int perf_ctr_bits;
+       int fixed_ctr_bits;
+       unsigned perf_ctr;
+       unsigned event_ctl;
+       unsigned event_mask;
+       unsigned fixed_ctr;
+       unsigned fixed_ctl;
+       unsigned box_ctl;
+       unsigned msr_offset;
+       unsigned num_shared_regs:8;
+       unsigned single_fixed:1;
+       unsigned pair_ctr_ctl:1;
+       unsigned *msr_offsets;
+       struct event_constraint unconstrainted;
+       struct event_constraint *constraints;
+       struct intel_uncore_pmu *pmus;
+       struct intel_uncore_ops *ops;
+       struct uncore_event_desc *event_descs;
+       const struct attribute_group *attr_groups[4];
+       struct pmu *pmu; /* for custom pmu ops */
+};
+
+#define pmu_group attr_groups[0]
+#define format_group attr_groups[1]
+#define events_group attr_groups[2]
+
+struct intel_uncore_ops {
+       void (*init_box)(struct intel_uncore_box *);
+       void (*exit_box)(struct intel_uncore_box *);
+       void (*disable_box)(struct intel_uncore_box *);
+       void (*enable_box)(struct intel_uncore_box *);
+       void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
+       void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
+       u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
+       int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
+       struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
+                                                  struct perf_event *);
+       void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
+};
+
+struct intel_uncore_pmu {
+       struct pmu                      pmu;
+       char                            name[UNCORE_PMU_NAME_LEN];
+       int                             pmu_idx;
+       int                             func_id;
+       bool                            registered;
+       atomic_t                        activeboxes;
+       struct intel_uncore_type        *type;
+       struct intel_uncore_box         **boxes;
+};
+
+struct intel_uncore_extra_reg {
+       raw_spinlock_t lock;
+       u64 config, config1, config2;
+       atomic_t ref;
+};
+
+struct intel_uncore_box {
+       int pci_phys_id;
+       int pkgid;
+       int n_active;   /* number of active events */
+       int n_events;
+       int cpu;        /* cpu to collect events */
+       unsigned long flags;
+       atomic_t refcnt;
+       struct perf_event *events[UNCORE_PMC_IDX_MAX];
+       struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
+       struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX];
+       unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
+       u64 tags[UNCORE_PMC_IDX_MAX];
+       struct pci_dev *pci_dev;
+       struct intel_uncore_pmu *pmu;
+       u64 hrtimer_duration; /* hrtimer timeout for this box */
+       struct hrtimer hrtimer;
+       struct list_head list;
+       struct list_head active_list;
+       void *io_addr;
+       struct intel_uncore_extra_reg shared_regs[0];
+};
+
+#define UNCORE_BOX_FLAG_INITIATED      0
+
+struct uncore_event_desc {
+       struct kobj_attribute attr;
+       const char *config;
+};
+
+struct pci2phy_map {
+       struct list_head list;
+       int segment;
+       int pbus_to_physid[256];
+};
+
+struct pci2phy_map *__find_pci2phy_map(int segment);
+
+ssize_t uncore_event_show(struct kobject *kobj,
+                         struct kobj_attribute *attr, char *buf);
+
+#define INTEL_UNCORE_EVENT_DESC(_name, _config)                        \
+{                                                              \
+       .attr   = __ATTR(_name, 0444, uncore_event_show, NULL), \
+       .config = _config,                                      \
+}
+
+#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format)                        \
+static ssize_t __uncore_##_var##_show(struct kobject *kobj,            \
+                               struct kobj_attribute *attr,            \
+                               char *page)                             \
+{                                                                      \
+       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);                     \
+       return sprintf(page, _format "\n");                             \
+}                                                                      \
+static struct kobj_attribute format_attr_##_var =                      \
+       __ATTR(_name, 0444, __uncore_##_var##_show, NULL)
+
+static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
+{
+       return box->pmu->type->box_ctl;
+}
+
+static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
+{
+       return box->pmu->type->fixed_ctl;
+}
+
+static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
+{
+       return box->pmu->type->fixed_ctr;
+}
+
+static inline
+unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
+{
+       return idx * 4 + box->pmu->type->event_ctl;
+}
+
+static inline
+unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+       return idx * 8 + box->pmu->type->perf_ctr;
+}
+
+static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box)
+{
+       struct intel_uncore_pmu *pmu = box->pmu;
+       return pmu->type->msr_offsets ?
+               pmu->type->msr_offsets[pmu->pmu_idx] :
+               pmu->type->msr_offset * pmu->pmu_idx;
+}
+
+static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
+{
+       if (!box->pmu->type->box_ctl)
+               return 0;
+       return box->pmu->type->box_ctl + uncore_msr_box_offset(box);
+}
+
+static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
+{
+       if (!box->pmu->type->fixed_ctl)
+               return 0;
+       return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box);
+}
+
+static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
+{
+       return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
+}
+
+static inline
+unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
+{
+       return box->pmu->type->event_ctl +
+               (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
+               uncore_msr_box_offset(box);
+}
+
+static inline
+unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+       return box->pmu->type->perf_ctr +
+               (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
+               uncore_msr_box_offset(box);
+}
+
+static inline
+unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
+{
+       if (box->pci_dev)
+               return uncore_pci_fixed_ctl(box);
+       else
+               return uncore_msr_fixed_ctl(box);
+}
+
+static inline
+unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
+{
+       if (box->pci_dev)
+               return uncore_pci_fixed_ctr(box);
+       else
+               return uncore_msr_fixed_ctr(box);
+}
+
+static inline
+unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
+{
+       if (box->pci_dev)
+               return uncore_pci_event_ctl(box, idx);
+       else
+               return uncore_msr_event_ctl(box, idx);
+}
+
+static inline
+unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+       if (box->pci_dev)
+               return uncore_pci_perf_ctr(box, idx);
+       else
+               return uncore_msr_perf_ctr(box, idx);
+}
+
+static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
+{
+       return box->pmu->type->perf_ctr_bits;
+}
+
+static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
+{
+       return box->pmu->type->fixed_ctr_bits;
+}
+
+static inline int uncore_num_counters(struct intel_uncore_box *box)
+{
+       return box->pmu->type->num_counters;
+}
+
+static inline void uncore_disable_box(struct intel_uncore_box *box)
+{
+       if (box->pmu->type->ops->disable_box)
+               box->pmu->type->ops->disable_box(box);
+}
+
+static inline void uncore_enable_box(struct intel_uncore_box *box)
+{
+       if (box->pmu->type->ops->enable_box)
+               box->pmu->type->ops->enable_box(box);
+}
+
+static inline void uncore_disable_event(struct intel_uncore_box *box,
+                               struct perf_event *event)
+{
+       box->pmu->type->ops->disable_event(box, event);
+}
+
+static inline void uncore_enable_event(struct intel_uncore_box *box,
+                               struct perf_event *event)
+{
+       box->pmu->type->ops->enable_event(box, event);
+}
+
+static inline u64 uncore_read_counter(struct intel_uncore_box *box,
+                               struct perf_event *event)
+{
+       return box->pmu->type->ops->read_counter(box, event);
+}
+
+static inline void uncore_box_init(struct intel_uncore_box *box)
+{
+       if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+               if (box->pmu->type->ops->init_box)
+                       box->pmu->type->ops->init_box(box);
+       }
+}
+
+static inline void uncore_box_exit(struct intel_uncore_box *box)
+{
+       if (test_and_clear_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+               if (box->pmu->type->ops->exit_box)
+                       box->pmu->type->ops->exit_box(box);
+       }
+}
+
+static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
+{
+       return (box->pkgid < 0);
+}
+
+static inline struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
+{
+       return container_of(event->pmu, struct intel_uncore_pmu, pmu);
+}
+
+static inline struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
+{
+       return event->pmu_private;
+}
+
+struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu);
+u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event);
+void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_event_read(struct perf_event *event);
+void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
+struct event_constraint *
+uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event);
+void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event);
+u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
+
+extern struct intel_uncore_type **uncore_msr_uncores;
+extern struct intel_uncore_type **uncore_pci_uncores;
+extern struct pci_driver *uncore_pci_driver;
+extern raw_spinlock_t pci2phy_map_lock;
+extern struct list_head pci2phy_map_head;
+extern struct pci_extra_dev *uncore_extra_pci_dev;
+extern struct event_constraint uncore_constraint_empty;
+
+/* perf_event_intel_uncore_snb.c */
+int snb_uncore_pci_init(void);
+int ivb_uncore_pci_init(void);
+int hsw_uncore_pci_init(void);
+int bdw_uncore_pci_init(void);
+int skl_uncore_pci_init(void);
+void snb_uncore_cpu_init(void);
+void nhm_uncore_cpu_init(void);
+int snb_pci2phy_map_init(int devid);
+
+/* perf_event_intel_uncore_snbep.c */
+int snbep_uncore_pci_init(void);
+void snbep_uncore_cpu_init(void);
+int ivbep_uncore_pci_init(void);
+void ivbep_uncore_cpu_init(void);
+int hswep_uncore_pci_init(void);
+void hswep_uncore_cpu_init(void);
+int bdx_uncore_pci_init(void);
+void bdx_uncore_cpu_init(void);
+int knl_uncore_pci_init(void);
+void knl_uncore_cpu_init(void);
+
+/* perf_event_intel_uncore_nhmex.c */
+void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c

new file mode 100644 (file)

index 0000000..cda5693
--- /dev/null
+++ b/arch/x86/events/intel/uncore_nhmex.c
@@ -0,0 +1,1227 @@
+/* Nehalem-EX/Westmere-EX uncore support */
+#include "uncore.h"
+
+/* NHM-EX event control */
+#define NHMEX_PMON_CTL_EV_SEL_MASK     0x000000ff
+#define NHMEX_PMON_CTL_UMASK_MASK      0x0000ff00
+#define NHMEX_PMON_CTL_EN_BIT0         (1 << 0)
+#define NHMEX_PMON_CTL_EDGE_DET                (1 << 18)
+#define NHMEX_PMON_CTL_PMI_EN          (1 << 20)
+#define NHMEX_PMON_CTL_EN_BIT22                (1 << 22)
+#define NHMEX_PMON_CTL_INVERT          (1 << 23)
+#define NHMEX_PMON_CTL_TRESH_MASK      0xff000000
+#define NHMEX_PMON_RAW_EVENT_MASK      (NHMEX_PMON_CTL_EV_SEL_MASK | \
+                                        NHMEX_PMON_CTL_UMASK_MASK | \
+                                        NHMEX_PMON_CTL_EDGE_DET | \
+                                        NHMEX_PMON_CTL_INVERT | \
+                                        NHMEX_PMON_CTL_TRESH_MASK)
+
+/* NHM-EX Ubox */
+#define NHMEX_U_MSR_PMON_GLOBAL_CTL            0xc00
+#define NHMEX_U_MSR_PMON_CTR                   0xc11
+#define NHMEX_U_MSR_PMON_EV_SEL                        0xc10
+
+#define NHMEX_U_PMON_GLOBAL_EN                 (1 << 0)
+#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL       0x0000001e
+#define NHMEX_U_PMON_GLOBAL_EN_ALL             (1 << 28)
+#define NHMEX_U_PMON_GLOBAL_RST_ALL            (1 << 29)
+#define NHMEX_U_PMON_GLOBAL_FRZ_ALL            (1 << 31)
+
+#define NHMEX_U_PMON_RAW_EVENT_MASK            \
+               (NHMEX_PMON_CTL_EV_SEL_MASK |   \
+                NHMEX_PMON_CTL_EDGE_DET)
+
+/* NHM-EX Cbox */
+#define NHMEX_C0_MSR_PMON_GLOBAL_CTL           0xd00
+#define NHMEX_C0_MSR_PMON_CTR0                 0xd11
+#define NHMEX_C0_MSR_PMON_EV_SEL0              0xd10
+#define NHMEX_C_MSR_OFFSET                     0x20
+
+/* NHM-EX Bbox */
+#define NHMEX_B0_MSR_PMON_GLOBAL_CTL           0xc20
+#define NHMEX_B0_MSR_PMON_CTR0                 0xc31
+#define NHMEX_B0_MSR_PMON_CTL0                 0xc30
+#define NHMEX_B_MSR_OFFSET                     0x40
+#define NHMEX_B0_MSR_MATCH                     0xe45
+#define NHMEX_B0_MSR_MASK                      0xe46
+#define NHMEX_B1_MSR_MATCH                     0xe4d
+#define NHMEX_B1_MSR_MASK                      0xe4e
+
+#define NHMEX_B_PMON_CTL_EN                    (1 << 0)
+#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT          1
+#define NHMEX_B_PMON_CTL_EV_SEL_MASK           \
+               (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_B_PMON_CTR_SHIFT         6
+#define NHMEX_B_PMON_CTR_MASK          \
+               (0x3 << NHMEX_B_PMON_CTR_SHIFT)
+#define NHMEX_B_PMON_RAW_EVENT_MASK            \
+               (NHMEX_B_PMON_CTL_EV_SEL_MASK | \
+                NHMEX_B_PMON_CTR_MASK)
+
+/* NHM-EX Sbox */
+#define NHMEX_S0_MSR_PMON_GLOBAL_CTL           0xc40
+#define NHMEX_S0_MSR_PMON_CTR0                 0xc51
+#define NHMEX_S0_MSR_PMON_CTL0                 0xc50
+#define NHMEX_S_MSR_OFFSET                     0x80
+#define NHMEX_S0_MSR_MM_CFG                    0xe48
+#define NHMEX_S0_MSR_MATCH                     0xe49
+#define NHMEX_S0_MSR_MASK                      0xe4a
+#define NHMEX_S1_MSR_MM_CFG                    0xe58
+#define NHMEX_S1_MSR_MATCH                     0xe59
+#define NHMEX_S1_MSR_MASK                      0xe5a
+
+#define NHMEX_S_PMON_MM_CFG_EN                 (0x1ULL << 63)
+#define NHMEX_S_EVENT_TO_R_PROG_EV             0
+
+/* NHM-EX Mbox */
+#define NHMEX_M0_MSR_GLOBAL_CTL                        0xca0
+#define NHMEX_M0_MSR_PMU_DSP                   0xca5
+#define NHMEX_M0_MSR_PMU_ISS                   0xca6
+#define NHMEX_M0_MSR_PMU_MAP                   0xca7
+#define NHMEX_M0_MSR_PMU_MSC_THR               0xca8
+#define NHMEX_M0_MSR_PMU_PGT                   0xca9
+#define NHMEX_M0_MSR_PMU_PLD                   0xcaa
+#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC           0xcab
+#define NHMEX_M0_MSR_PMU_CTL0                  0xcb0
+#define NHMEX_M0_MSR_PMU_CNT0                  0xcb1
+#define NHMEX_M_MSR_OFFSET                     0x40
+#define NHMEX_M0_MSR_PMU_MM_CFG                        0xe54
+#define NHMEX_M1_MSR_PMU_MM_CFG                        0xe5c
+
+#define NHMEX_M_PMON_MM_CFG_EN                 (1ULL << 63)
+#define NHMEX_M_PMON_ADDR_MATCH_MASK           0x3ffffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_MASK            0x7ffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_SHIFT           34
+
+#define NHMEX_M_PMON_CTL_EN                    (1 << 0)
+#define NHMEX_M_PMON_CTL_PMI_EN                        (1 << 1)
+#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT      2
+#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK       \
+       (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT    4
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK     \
+       (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_WRAP_MODE             (1 << 6)
+#define NHMEX_M_PMON_CTL_FLAG_MODE             (1 << 7)
+#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT         9
+#define NHMEX_M_PMON_CTL_INC_SEL_MASK          \
+       (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT    19
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK     \
+       (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
+#define NHMEX_M_PMON_RAW_EVENT_MASK                    \
+               (NHMEX_M_PMON_CTL_COUNT_MODE_MASK |     \
+                NHMEX_M_PMON_CTL_STORAGE_MODE_MASK |   \
+                NHMEX_M_PMON_CTL_WRAP_MODE |           \
+                NHMEX_M_PMON_CTL_FLAG_MODE |           \
+                NHMEX_M_PMON_CTL_INC_SEL_MASK |        \
+                NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
+
+#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK          (((1 << 11) - 1) | (1 << 23))
+#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n)))
+
+#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK          (((1 << 12) - 1) | (1 << 24))
+#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n)))
+
+/*
+ * use the 9~13 bits to select event If the 7th bit is not set,
+ * otherwise use the 19~21 bits to select event.
+ */
+#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
+                               NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
+                          NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
+                               NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_EXTAR_REG(c, r) \
+               EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+                               MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
+#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
+               EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+                               MBOX_SET_FLAG_SEL_MASK, \
+                               (u64)-1, NHMEX_M_##r)
+
+/* NHM-EX Rbox */
+#define NHMEX_R_MSR_GLOBAL_CTL                 0xe00
+#define NHMEX_R_MSR_PMON_CTL0                  0xe10
+#define NHMEX_R_MSR_PMON_CNT0                  0xe11
+#define NHMEX_R_MSR_OFFSET                     0x20
+
+#define NHMEX_R_MSR_PORTN_QLX_CFG(n)           \
+               ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n)                (0xe04 + (n))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n)                (0xe24 + (n))
+#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n)                \
+               (((n) < 4 ? 0 : 0x10) + (n) * 4)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n)   \
+               (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n)    \
+               (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n)     \
+               (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n)   \
+               (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n)    \
+               (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n)     \
+               (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
+
+#define NHMEX_R_PMON_CTL_EN                    (1 << 0)
+#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT          1
+#define NHMEX_R_PMON_CTL_EV_SEL_MASK           \
+               (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_R_PMON_CTL_PMI_EN                        (1 << 6)
+#define NHMEX_R_PMON_RAW_EVENT_MASK            NHMEX_R_PMON_CTL_EV_SEL_MASK
+
+/* NHM-EX Wbox */
+#define NHMEX_W_MSR_GLOBAL_CTL                 0xc80
+#define NHMEX_W_MSR_PMON_CNT0                  0xc90
+#define NHMEX_W_MSR_PMON_EVT_SEL0              0xc91
+#define NHMEX_W_MSR_PMON_FIXED_CTR             0x394
+#define NHMEX_W_MSR_PMON_FIXED_CTL             0x395
+
+#define NHMEX_W_PMON_GLOBAL_FIXED_EN           (1ULL << 31)
+
+#define __BITS_VALUE(x, i, n)  ((typeof(x))(((x) >> ((i) * (n))) & \
+                               ((1ULL << (n)) - 1)))
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
+DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
+
+static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+       wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
+}
+
+static void nhmex_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+       wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, 0);
+}
+
+static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+       unsigned msr = uncore_msr_box_ctl(box);
+       u64 config;
+
+       if (msr) {
+               rdmsrl(msr, config);
+               config &= ~((1ULL << uncore_num_counters(box)) - 1);
+               /* WBox has a fixed counter */
+               if (uncore_msr_fixed_ctl(box))
+                       config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
+               wrmsrl(msr, config);
+       }
+}
+
+static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+       unsigned msr = uncore_msr_box_ctl(box);
+       u64 config;
+
+       if (msr) {
+               rdmsrl(msr, config);
+               config |= (1ULL << uncore_num_counters(box)) - 1;
+               /* WBox has a fixed counter */
+               if (uncore_msr_fixed_ctl(box))
+                       config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
+               wrmsrl(msr, config);
+       }
+}
+
+static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       wrmsrl(event->hw.config_base, 0);
+}
+
+static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
+               wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
+       else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
+               wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+       else
+               wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+#define NHMEX_UNCORE_OPS_COMMON_INIT()                         \
+       .init_box       = nhmex_uncore_msr_init_box,            \
+       .exit_box       = nhmex_uncore_msr_exit_box,            \
+       .disable_box    = nhmex_uncore_msr_disable_box,         \
+       .enable_box     = nhmex_uncore_msr_enable_box,          \
+       .disable_event  = nhmex_uncore_msr_disable_event,       \
+       .read_counter   = uncore_msr_read_counter
+
+static struct intel_uncore_ops nhmex_uncore_ops = {
+       NHMEX_UNCORE_OPS_COMMON_INIT(),
+       .enable_event   = nhmex_uncore_msr_enable_event,
+};
+
+static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_edge.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_ubox_format_group = {
+       .name           = "format",
+       .attrs          = nhmex_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type nhmex_uncore_ubox = {
+       .name           = "ubox",
+       .num_counters   = 1,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       .event_ctl      = NHMEX_U_MSR_PMON_EV_SEL,
+       .perf_ctr       = NHMEX_U_MSR_PMON_CTR,
+       .event_mask     = NHMEX_U_PMON_RAW_EVENT_MASK,
+       .box_ctl        = NHMEX_U_MSR_PMON_GLOBAL_CTL,
+       .ops            = &nhmex_uncore_ops,
+       .format_group   = &nhmex_uncore_ubox_format_group
+};
+
+static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_cbox_format_group = {
+       .name = "format",
+       .attrs = nhmex_uncore_cbox_formats_attr,
+};
+
+/* msr offset for each instance of cbox */
+static unsigned nhmex_cbox_msr_offsets[] = {
+       0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
+};
+
+static struct intel_uncore_type nhmex_uncore_cbox = {
+       .name                   = "cbox",
+       .num_counters           = 6,
+       .num_boxes              = 10,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_C0_MSR_PMON_EV_SEL0,
+       .perf_ctr               = NHMEX_C0_MSR_PMON_CTR0,
+       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_C0_MSR_PMON_GLOBAL_CTL,
+       .msr_offsets            = nhmex_cbox_msr_offsets,
+       .pair_ctr_ctl           = 1,
+       .ops                    = &nhmex_uncore_ops,
+       .format_group           = &nhmex_uncore_cbox_format_group
+};
+
+static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
+       { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type nhmex_uncore_wbox = {
+       .name                   = "wbox",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_W_MSR_PMON_CNT0,
+       .perf_ctr               = NHMEX_W_MSR_PMON_EVT_SEL0,
+       .fixed_ctr              = NHMEX_W_MSR_PMON_FIXED_CTR,
+       .fixed_ctl              = NHMEX_W_MSR_PMON_FIXED_CTL,
+       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_W_MSR_GLOBAL_CTL,
+       .pair_ctr_ctl           = 1,
+       .event_descs            = nhmex_uncore_wbox_events,
+       .ops                    = &nhmex_uncore_ops,
+       .format_group           = &nhmex_uncore_cbox_format_group
+};
+
+static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+       int ctr, ev_sel;
+
+       ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
+               NHMEX_B_PMON_CTR_SHIFT;
+       ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
+                 NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
+
+       /* events that do not use the match/mask registers */
+       if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
+           (ctr == 2 && ev_sel != 0x4) || ctr == 3)
+               return 0;
+
+       if (box->pmu->pmu_idx == 0)
+               reg1->reg = NHMEX_B0_MSR_MATCH;
+       else
+               reg1->reg = NHMEX_B1_MSR_MATCH;
+       reg1->idx = 0;
+       reg1->config = event->attr.config1;
+       reg2->config = event->attr.config2;
+       return 0;
+}
+
+static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE) {
+               wrmsrl(reg1->reg, reg1->config);
+               wrmsrl(reg1->reg + 1, reg2->config);
+       }
+       wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+               (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
+}
+
+/*
+ * The Bbox has 4 counters, but each counter monitors different events.
+ * Use bits 6-7 in the event config to select counter.
+ */
+static struct event_constraint nhmex_uncore_bbox_constraints[] = {
+       EVENT_CONSTRAINT(0 , 1, 0xc0),
+       EVENT_CONSTRAINT(0x40, 2, 0xc0),
+       EVENT_CONSTRAINT(0x80, 4, 0xc0),
+       EVENT_CONSTRAINT(0xc0, 8, 0xc0),
+       EVENT_CONSTRAINT_END,
+};
+
+static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
+       &format_attr_event5.attr,
+       &format_attr_counter.attr,
+       &format_attr_match.attr,
+       &format_attr_mask.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_bbox_format_group = {
+       .name = "format",
+       .attrs = nhmex_uncore_bbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
+       NHMEX_UNCORE_OPS_COMMON_INIT(),
+       .enable_event           = nhmex_bbox_msr_enable_event,
+       .hw_config              = nhmex_bbox_hw_config,
+       .get_constraint         = uncore_get_constraint,
+       .put_constraint         = uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_bbox = {
+       .name                   = "bbox",
+       .num_counters           = 4,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_B0_MSR_PMON_CTL0,
+       .perf_ctr               = NHMEX_B0_MSR_PMON_CTR0,
+       .event_mask             = NHMEX_B_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_B0_MSR_PMON_GLOBAL_CTL,
+       .msr_offset             = NHMEX_B_MSR_OFFSET,
+       .pair_ctr_ctl           = 1,
+       .num_shared_regs        = 1,
+       .constraints            = nhmex_uncore_bbox_constraints,
+       .ops                    = &nhmex_uncore_bbox_ops,
+       .format_group           = &nhmex_uncore_bbox_format_group
+};
+
+static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+       /* only TO_R_PROG_EV event uses the match/mask register */
+       if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
+           NHMEX_S_EVENT_TO_R_PROG_EV)
+               return 0;
+
+       if (box->pmu->pmu_idx == 0)
+               reg1->reg = NHMEX_S0_MSR_MM_CFG;
+       else
+               reg1->reg = NHMEX_S1_MSR_MM_CFG;
+       reg1->idx = 0;
+       reg1->config = event->attr.config1;
+       reg2->config = event->attr.config2;
+       return 0;
+}
+
+static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE) {
+               wrmsrl(reg1->reg, 0);
+               wrmsrl(reg1->reg + 1, reg1->config);
+               wrmsrl(reg1->reg + 2, reg2->config);
+               wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
+       }
+       wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+}
+
+static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_match.attr,
+       &format_attr_mask.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_sbox_format_group = {
+       .name                   = "format",
+       .attrs                  = nhmex_uncore_sbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
+       NHMEX_UNCORE_OPS_COMMON_INIT(),
+       .enable_event           = nhmex_sbox_msr_enable_event,
+       .hw_config              = nhmex_sbox_hw_config,
+       .get_constraint         = uncore_get_constraint,
+       .put_constraint         = uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_sbox = {
+       .name                   = "sbox",
+       .num_counters           = 4,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_S0_MSR_PMON_CTL0,
+       .perf_ctr               = NHMEX_S0_MSR_PMON_CTR0,
+       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_S0_MSR_PMON_GLOBAL_CTL,
+       .msr_offset             = NHMEX_S_MSR_OFFSET,
+       .pair_ctr_ctl           = 1,
+       .num_shared_regs        = 1,
+       .ops                    = &nhmex_uncore_sbox_ops,
+       .format_group           = &nhmex_uncore_sbox_format_group
+};
+
+enum {
+       EXTRA_REG_NHMEX_M_FILTER,
+       EXTRA_REG_NHMEX_M_DSP,
+       EXTRA_REG_NHMEX_M_ISS,
+       EXTRA_REG_NHMEX_M_MAP,
+       EXTRA_REG_NHMEX_M_MSC_THR,
+       EXTRA_REG_NHMEX_M_PGT,
+       EXTRA_REG_NHMEX_M_PLD,
+       EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
+};
+
+static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
+       MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
+       MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
+       MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
+       MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
+       /* event 0xa uses two extra registers */
+       MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
+       MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
+       MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
+       /* events 0xd ~ 0x10 use the same extra register */
+       MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
+       MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
+       MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
+       MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
+       MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
+       MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
+       MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
+       MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
+       MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
+       EVENT_EXTRA_END
+};
+
+/* Nehalem-EX or Westmere-EX ? */
+static bool uncore_nhmex;
+
+static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
+{
+       struct intel_uncore_extra_reg *er;
+       unsigned long flags;
+       bool ret = false;
+       u64 mask;
+
+       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+               er = &box->shared_regs[idx];
+               raw_spin_lock_irqsave(&er->lock, flags);
+               if (!atomic_read(&er->ref) || er->config == config) {
+                       atomic_inc(&er->ref);
+                       er->config = config;
+                       ret = true;
+               }
+               raw_spin_unlock_irqrestore(&er->lock, flags);
+
+               return ret;
+       }
+       /*
+        * The ZDP_CTL_FVC MSR has 4 fields which are used to control
+        * events 0xd ~ 0x10. Besides these 4 fields, there are additional
+        * fields which are shared.
+        */
+       idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+       if (WARN_ON_ONCE(idx >= 4))
+               return false;
+
+       /* mask of the shared fields */
+       if (uncore_nhmex)
+               mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
+       else
+               mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
+       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+
+       raw_spin_lock_irqsave(&er->lock, flags);
+       /* add mask of the non-shared field if it's in use */
+       if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
+               if (uncore_nhmex)
+                       mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+               else
+                       mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+       }
+
+       if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
+               atomic_add(1 << (idx * 8), &er->ref);
+               if (uncore_nhmex)
+                       mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
+                               NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+               else
+                       mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
+                               WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+               er->config &= ~mask;
+               er->config |= (config & mask);
+               ret = true;
+       }
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+
+       return ret;
+}
+
+static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
+{
+       struct intel_uncore_extra_reg *er;
+
+       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+               er = &box->shared_regs[idx];
+               atomic_dec(&er->ref);
+               return;
+       }
+
+       idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+       atomic_sub(1 << (idx * 8), &er->ref);
+}
+
+static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
+       u64 config = reg1->config;
+
+       /* get the non-shared control bits and shift them */
+       idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+       if (uncore_nhmex)
+               config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+       else
+               config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+       if (new_idx > orig_idx) {
+               idx = new_idx - orig_idx;
+               config <<= 3 * idx;
+       } else {
+               idx = orig_idx - new_idx;
+               config >>= 3 * idx;
+       }
+
+       /* add the shared control bits back */
+       if (uncore_nhmex)
+               config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+       else
+               config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+       config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+       if (modify) {
+               /* adjust the main event selector */
+               if (new_idx > orig_idx)
+                       hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+               else
+                       hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+               reg1->config = config;
+               reg1->idx = ~0xff | new_idx;
+       }
+       return config;
+}
+
+static struct event_constraint *
+nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+       int i, idx[2], alloc = 0;
+       u64 config1 = reg1->config;
+
+       idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
+       idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
+again:
+       for (i = 0; i < 2; i++) {
+               if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+                       idx[i] = 0xff;
+
+               if (idx[i] == 0xff)
+                       continue;
+
+               if (!nhmex_mbox_get_shared_reg(box, idx[i],
+                               __BITS_VALUE(config1, i, 32)))
+                       goto fail;
+               alloc |= (0x1 << i);
+       }
+
+       /* for the match/mask registers */
+       if (reg2->idx != EXTRA_REG_NONE &&
+           (uncore_box_is_fake(box) || !reg2->alloc) &&
+           !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
+               goto fail;
+
+       /*
+        * If it's a fake box -- as per validate_{group,event}() we
+        * shouldn't touch event state and we can avoid doing so
+        * since both will only call get_event_constraints() once
+        * on each event, this avoids the need for reg->alloc.
+        */
+       if (!uncore_box_is_fake(box)) {
+               if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
+                       nhmex_mbox_alter_er(event, idx[0], true);
+               reg1->alloc |= alloc;
+               if (reg2->idx != EXTRA_REG_NONE)
+                       reg2->alloc = 1;
+       }
+       return NULL;
+fail:
+       if (idx[0] != 0xff && !(alloc & 0x1) &&
+           idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+               /*
+                * events 0xd ~ 0x10 are functional identical, but are
+                * controlled by different fields in the ZDP_CTL_FVC
+                * register. If we failed to take one field, try the
+                * rest 3 choices.
+                */
+               BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
+               idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+               idx[0] = (idx[0] + 1) % 4;
+               idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+               if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
+                       config1 = nhmex_mbox_alter_er(event, idx[0], false);
+                       goto again;
+               }
+       }
+
+       if (alloc & 0x1)
+               nhmex_mbox_put_shared_reg(box, idx[0]);
+       if (alloc & 0x2)
+               nhmex_mbox_put_shared_reg(box, idx[1]);
+       return &uncore_constraint_empty;
+}
+
+static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+
+       if (uncore_box_is_fake(box))
+               return;
+
+       if (reg1->alloc & 0x1)
+               nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
+       if (reg1->alloc & 0x2)
+               nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
+       reg1->alloc = 0;
+
+       if (reg2->alloc) {
+               nhmex_mbox_put_shared_reg(box, reg2->idx);
+               reg2->alloc = 0;
+       }
+}
+
+static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
+{
+       if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+               return er->idx;
+       return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
+}
+
+static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct intel_uncore_type *type = box->pmu->type;
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+       struct extra_reg *er;
+       unsigned msr;
+       int reg_idx = 0;
+       /*
+        * The mbox events may require 2 extra MSRs at the most. But only
+        * the lower 32 bits in these MSRs are significant, so we can use
+        * config1 to pass two MSRs' config.
+        */
+       for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
+               if (er->event != (event->hw.config & er->config_mask))
+                       continue;
+               if (event->attr.config1 & ~er->valid_mask)
+                       return -EINVAL;
+
+               msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
+               if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
+                       return -EINVAL;
+
+               /* always use the 32~63 bits to pass the PLD config */
+               if (er->idx == EXTRA_REG_NHMEX_M_PLD)
+                       reg_idx = 1;
+               else if (WARN_ON_ONCE(reg_idx > 0))
+                       return -EINVAL;
+
+               reg1->idx &= ~(0xff << (reg_idx * 8));
+               reg1->reg &= ~(0xffff << (reg_idx * 16));
+               reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
+               reg1->reg |= msr << (reg_idx * 16);
+               reg1->config = event->attr.config1;
+               reg_idx++;
+       }
+       /*
+        * The mbox only provides ability to perform address matching
+        * for the PLD events.
+        */
+       if (reg_idx == 2) {
+               reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
+               if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
+                       reg2->config = event->attr.config2;
+               else
+                       reg2->config = ~0ULL;
+               if (box->pmu->pmu_idx == 0)
+                       reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
+               else
+                       reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
+       }
+       return 0;
+}
+
+static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+       struct intel_uncore_extra_reg *er;
+       unsigned long flags;
+       u64 config;
+
+       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+               return box->shared_regs[idx].config;
+
+       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+       raw_spin_lock_irqsave(&er->lock, flags);
+       config = er->config;
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+       return config;
+}
+
+static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+       int idx;
+
+       idx = __BITS_VALUE(reg1->idx, 0, 8);
+       if (idx != 0xff)
+               wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
+                       nhmex_mbox_shared_reg_config(box, idx));
+       idx = __BITS_VALUE(reg1->idx, 1, 8);
+       if (idx != 0xff)
+               wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
+                       nhmex_mbox_shared_reg_config(box, idx));
+
+       if (reg2->idx != EXTRA_REG_NONE) {
+               wrmsrl(reg2->reg, 0);
+               if (reg2->config != ~0ULL) {
+                       wrmsrl(reg2->reg + 1,
+                               reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
+                       wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
+                               (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
+                       wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
+               }
+       }
+
+       wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(count_mode,          count_mode,     "config:2-3");
+DEFINE_UNCORE_FORMAT_ATTR(storage_mode,                storage_mode,   "config:4-5");
+DEFINE_UNCORE_FORMAT_ATTR(wrap_mode,           wrap_mode,      "config:6");
+DEFINE_UNCORE_FORMAT_ATTR(flag_mode,           flag_mode,      "config:7");
+DEFINE_UNCORE_FORMAT_ATTR(inc_sel,             inc_sel,        "config:9-13");
+DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel,                set_flag_sel,   "config:19-21");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en,       filter_cfg_en,  "config2:63");
+DEFINE_UNCORE_FORMAT_ATTR(filter_match,                filter_match,   "config2:0-33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_mask,         filter_mask,    "config2:34-61");
+DEFINE_UNCORE_FORMAT_ATTR(dsp,                 dsp,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(thr,                 thr,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(fvc,                 fvc,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pgt,                 pgt,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(map,                 map,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(iss,                 iss,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pld,                 pld,            "config1:32-63");
+
+static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
+       &format_attr_count_mode.attr,
+       &format_attr_storage_mode.attr,
+       &format_attr_wrap_mode.attr,
+       &format_attr_flag_mode.attr,
+       &format_attr_inc_sel.attr,
+       &format_attr_set_flag_sel.attr,
+       &format_attr_filter_cfg_en.attr,
+       &format_attr_filter_match.attr,
+       &format_attr_filter_mask.attr,
+       &format_attr_dsp.attr,
+       &format_attr_thr.attr,
+       &format_attr_fvc.attr,
+       &format_attr_pgt.attr,
+       &format_attr_map.attr,
+       &format_attr_iss.attr,
+       &format_attr_pld.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_mbox_format_group = {
+       .name           = "format",
+       .attrs          = nhmex_uncore_mbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
+       INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
+       INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
+       { /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
+       INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
+       INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
+       { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
+       NHMEX_UNCORE_OPS_COMMON_INIT(),
+       .enable_event   = nhmex_mbox_msr_enable_event,
+       .hw_config      = nhmex_mbox_hw_config,
+       .get_constraint = nhmex_mbox_get_constraint,
+       .put_constraint = nhmex_mbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_mbox = {
+       .name                   = "mbox",
+       .num_counters           = 6,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_M0_MSR_PMU_CTL0,
+       .perf_ctr               = NHMEX_M0_MSR_PMU_CNT0,
+       .event_mask             = NHMEX_M_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_M0_MSR_GLOBAL_CTL,
+       .msr_offset             = NHMEX_M_MSR_OFFSET,
+       .pair_ctr_ctl           = 1,
+       .num_shared_regs        = 8,
+       .event_descs            = nhmex_uncore_mbox_events,
+       .ops                    = &nhmex_uncore_mbox_ops,
+       .format_group           = &nhmex_uncore_mbox_format_group,
+};
+
+static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+       /* adjust the main event selector and extra register index */
+       if (reg1->idx % 2) {
+               reg1->idx--;
+               hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+       } else {
+               reg1->idx++;
+               hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+       }
+
+       /* adjust extra register config */
+       switch (reg1->idx % 6) {
+       case 2:
+               /* shift the 8~15 bits to the 0~7 bits */
+               reg1->config >>= 8;
+               break;
+       case 3:
+               /* shift the 0~7 bits to the 8~15 bits */
+               reg1->config <<= 8;
+               break;
+       }
+}
+
+/*
+ * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7.
+ * An event set consists of 6 events, the 3rd and 4th events in
+ * an event set use the same extra register. So an event set uses
+ * 5 extra registers.
+ */
+static struct event_constraint *
+nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+       struct intel_uncore_extra_reg *er;
+       unsigned long flags;
+       int idx, er_idx;
+       u64 config1;
+       bool ok = false;
+
+       if (!uncore_box_is_fake(box) && reg1->alloc)
+               return NULL;
+
+       idx = reg1->idx % 6;
+       config1 = reg1->config;
+again:
+       er_idx = idx;
+       /* the 3rd and 4th events use the same extra register */
+       if (er_idx > 2)
+               er_idx--;
+       er_idx += (reg1->idx / 6) * 5;
+
+       er = &box->shared_regs[er_idx];
+       raw_spin_lock_irqsave(&er->lock, flags);
+       if (idx < 2) {
+               if (!atomic_read(&er->ref) || er->config == reg1->config) {
+                       atomic_inc(&er->ref);
+                       er->config = reg1->config;
+                       ok = true;
+               }
+       } else if (idx == 2 || idx == 3) {
+               /*
+                * these two events use different fields in a extra register,
+                * the 0~7 bits and the 8~15 bits respectively.
+                */
+               u64 mask = 0xff << ((idx - 2) * 8);
+               if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
+                               !((er->config ^ config1) & mask)) {
+                       atomic_add(1 << ((idx - 2) * 8), &er->ref);
+                       er->config &= ~mask;
+                       er->config |= config1 & mask;
+                       ok = true;
+               }
+       } else {
+               if (!atomic_read(&er->ref) ||
+                               (er->config == (hwc->config >> 32) &&
+                                er->config1 == reg1->config &&
+                                er->config2 == reg2->config)) {
+                       atomic_inc(&er->ref);
+                       er->config = (hwc->config >> 32);
+                       er->config1 = reg1->config;
+                       er->config2 = reg2->config;
+                       ok = true;
+               }
+       }
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+
+       if (!ok) {
+               /*
+                * The Rbox events are always in pairs. The paired
+                * events are functional identical, but use different
+                * extra registers. If we failed to take an extra
+                * register, try the alternative.
+                */
+               idx ^= 1;
+               if (idx != reg1->idx % 6) {
+                       if (idx == 2)
+                               config1 >>= 8;
+                       else if (idx == 3)
+                               config1 <<= 8;
+                       goto again;
+               }
+       } else {
+               if (!uncore_box_is_fake(box)) {
+                       if (idx != reg1->idx % 6)
+                               nhmex_rbox_alter_er(box, event);
+                       reg1->alloc = 1;
+               }
+               return NULL;
+       }
+       return &uncore_constraint_empty;
+}
+
+static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct intel_uncore_extra_reg *er;
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       int idx, er_idx;
+
+       if (uncore_box_is_fake(box) || !reg1->alloc)
+               return;
+
+       idx = reg1->idx % 6;
+       er_idx = idx;
+       if (er_idx > 2)
+               er_idx--;
+       er_idx += (reg1->idx / 6) * 5;
+
+       er = &box->shared_regs[er_idx];
+       if (idx == 2 || idx == 3)
+               atomic_sub(1 << ((idx - 2) * 8), &er->ref);
+       else
+               atomic_dec(&er->ref);
+
+       reg1->alloc = 0;
+}
+
+static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+       int idx;
+
+       idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
+               NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+       if (idx >= 0x18)
+               return -EINVAL;
+
+       reg1->idx = idx;
+       reg1->config = event->attr.config1;
+
+       switch (idx % 6) {
+       case 4:
+       case 5:
+               hwc->config |= event->attr.config & (~0ULL << 32);
+               reg2->config = event->attr.config2;
+               break;
+       }
+       return 0;
+}
+
+static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+       int idx, port;
+
+       idx = reg1->idx;
+       port = idx / 6 + box->pmu->pmu_idx * 4;
+
+       switch (idx % 6) {
+       case 0:
+               wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
+               break;
+       case 1:
+               wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
+               break;
+       case 2:
+       case 3:
+               wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
+                       uncore_shared_reg_config(box, 2 + (idx / 6) * 5));
+               break;
+       case 4:
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
+                       hwc->config >> 32);
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
+               break;
+       case 5:
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
+                       hwc->config >> 32);
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
+               break;
+       }
+
+       wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+               (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
+DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
+
+static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
+       &format_attr_event5.attr,
+       &format_attr_xbr_mm_cfg.attr,
+       &format_attr_xbr_match.attr,
+       &format_attr_xbr_mask.attr,
+       &format_attr_qlx_cfg.attr,
+       &format_attr_iperf_cfg.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_rbox_format_group = {
+       .name = "format",
+       .attrs = nhmex_uncore_rbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
+       INTEL_UNCORE_EVENT_DESC(qpi0_flit_send,         "event=0x0,iperf_cfg=0x80000000"),
+       INTEL_UNCORE_EVENT_DESC(qpi1_filt_send,         "event=0x6,iperf_cfg=0x80000000"),
+       INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt,         "event=0x0,iperf_cfg=0x40000000"),
+       INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt,         "event=0x6,iperf_cfg=0x40000000"),
+       INTEL_UNCORE_EVENT_DESC(qpi0_date_response,     "event=0x0,iperf_cfg=0xc4"),
+       INTEL_UNCORE_EVENT_DESC(qpi1_date_response,     "event=0x6,iperf_cfg=0xc4"),
+       { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
+       NHMEX_UNCORE_OPS_COMMON_INIT(),
+       .enable_event           = nhmex_rbox_msr_enable_event,
+       .hw_config              = nhmex_rbox_hw_config,
+       .get_constraint         = nhmex_rbox_get_constraint,
+       .put_constraint         = nhmex_rbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_rbox = {
+       .name                   = "rbox",
+       .num_counters           = 8,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_R_MSR_PMON_CTL0,
+       .perf_ctr               = NHMEX_R_MSR_PMON_CNT0,
+       .event_mask             = NHMEX_R_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_R_MSR_GLOBAL_CTL,
+       .msr_offset             = NHMEX_R_MSR_OFFSET,
+       .pair_ctr_ctl           = 1,
+       .num_shared_regs        = 20,
+       .event_descs            = nhmex_uncore_rbox_events,
+       .ops                    = &nhmex_uncore_rbox_ops,
+       .format_group           = &nhmex_uncore_rbox_format_group
+};
+
+static struct intel_uncore_type *nhmex_msr_uncores[] = {
+       &nhmex_uncore_ubox,
+       &nhmex_uncore_cbox,
+       &nhmex_uncore_bbox,
+       &nhmex_uncore_sbox,
+       &nhmex_uncore_mbox,
+       &nhmex_uncore_rbox,
+       &nhmex_uncore_wbox,
+       NULL,
+};
+
+void nhmex_uncore_cpu_init(void)
+{
+       if (boot_cpu_data.x86_model == 46)
+               uncore_nhmex = true;
+       else
+               nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
+       if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+       uncore_msr_uncores = nhmex_msr_uncores;
+}
+/* end of Nehalem-EX uncore support */
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c

new file mode 100644 (file)

index 0000000..96531d2
--- /dev/null
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -0,0 +1,731 @@
+/* Nehalem/SandBridge/Haswell uncore support */
+#include "uncore.h"
+
+/* Uncore IMC PCI IDs */
+#define PCI_DEVICE_ID_INTEL_SNB_IMC    0x0100
+#define PCI_DEVICE_ID_INTEL_IVB_IMC    0x0154
+#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
+#define PCI_DEVICE_ID_INTEL_HSW_IMC    0x0c00
+#define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
+#define PCI_DEVICE_ID_INTEL_BDW_IMC    0x1604
+#define PCI_DEVICE_ID_INTEL_SKL_IMC    0x191f
+
+/* SNB event control */
+#define SNB_UNC_CTL_EV_SEL_MASK                        0x000000ff
+#define SNB_UNC_CTL_UMASK_MASK                 0x0000ff00
+#define SNB_UNC_CTL_EDGE_DET                   (1 << 18)
+#define SNB_UNC_CTL_EN                         (1 << 22)
+#define SNB_UNC_CTL_INVERT                     (1 << 23)
+#define SNB_UNC_CTL_CMASK_MASK                 0x1f000000
+#define NHM_UNC_CTL_CMASK_MASK                 0xff000000
+#define NHM_UNC_FIXED_CTR_CTL_EN               (1 << 0)
+
+#define SNB_UNC_RAW_EVENT_MASK                 (SNB_UNC_CTL_EV_SEL_MASK | \
+                                                SNB_UNC_CTL_UMASK_MASK | \
+                                                SNB_UNC_CTL_EDGE_DET | \
+                                                SNB_UNC_CTL_INVERT | \
+                                                SNB_UNC_CTL_CMASK_MASK)
+
+#define NHM_UNC_RAW_EVENT_MASK                 (SNB_UNC_CTL_EV_SEL_MASK | \
+                                                SNB_UNC_CTL_UMASK_MASK | \
+                                                SNB_UNC_CTL_EDGE_DET | \
+                                                SNB_UNC_CTL_INVERT | \
+                                                NHM_UNC_CTL_CMASK_MASK)
+
+/* SNB global control register */
+#define SNB_UNC_PERF_GLOBAL_CTL                 0x391
+#define SNB_UNC_FIXED_CTR_CTRL                  0x394
+#define SNB_UNC_FIXED_CTR                       0x395
+
+/* SNB uncore global control */
+#define SNB_UNC_GLOBAL_CTL_CORE_ALL             ((1 << 4) - 1)
+#define SNB_UNC_GLOBAL_CTL_EN                   (1 << 29)
+
+/* SNB Cbo register */
+#define SNB_UNC_CBO_0_PERFEVTSEL0               0x700
+#define SNB_UNC_CBO_0_PER_CTR0                  0x706
+#define SNB_UNC_CBO_MSR_OFFSET                  0x10
+
+/* SNB ARB register */
+#define SNB_UNC_ARB_PER_CTR0                   0x3b0
+#define SNB_UNC_ARB_PERFEVTSEL0                        0x3b2
+#define SNB_UNC_ARB_MSR_OFFSET                 0x10
+
+/* NHM global control register */
+#define NHM_UNC_PERF_GLOBAL_CTL                 0x391
+#define NHM_UNC_FIXED_CTR                       0x394
+#define NHM_UNC_FIXED_CTR_CTRL                  0x395
+
+/* NHM uncore global control */
+#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL            ((1ULL << 8) - 1)
+#define NHM_UNC_GLOBAL_CTL_EN_FC                (1ULL << 32)
+
+/* NHM uncore register */
+#define NHM_UNC_PERFEVTSEL0                     0x3c0
+#define NHM_UNC_UNCORE_PMC0                     0x3b0
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
+
+/* Sandy Bridge uncore support */
+static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+               wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+       else
+               wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
+}
+
+static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       wrmsrl(event->hw.config_base, 0);
+}
+
+static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+       if (box->pmu->pmu_idx == 0) {
+               wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
+                       SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
+       }
+}
+
+static void snb_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+       if (box->pmu->pmu_idx == 0)
+               wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static struct uncore_event_desc snb_uncore_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+       { /* end: all zeroes */ },
+};
+
+static struct attribute *snb_uncore_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask5.attr,
+       NULL,
+};
+
+static struct attribute_group snb_uncore_format_group = {
+       .name           = "format",
+       .attrs          = snb_uncore_formats_attr,
+};
+
+static struct intel_uncore_ops snb_uncore_msr_ops = {
+       .init_box       = snb_uncore_msr_init_box,
+       .exit_box       = snb_uncore_msr_exit_box,
+       .disable_event  = snb_uncore_msr_disable_event,
+       .enable_event   = snb_uncore_msr_enable_event,
+       .read_counter   = uncore_msr_read_counter,
+};
+
+static struct event_constraint snb_uncore_arb_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snb_uncore_cbox = {
+       .name           = "cbox",
+       .num_counters   = 2,
+       .num_boxes      = 4,
+       .perf_ctr_bits  = 44,
+       .fixed_ctr_bits = 48,
+       .perf_ctr       = SNB_UNC_CBO_0_PER_CTR0,
+       .event_ctl      = SNB_UNC_CBO_0_PERFEVTSEL0,
+       .fixed_ctr      = SNB_UNC_FIXED_CTR,
+       .fixed_ctl      = SNB_UNC_FIXED_CTR_CTRL,
+       .single_fixed   = 1,
+       .event_mask     = SNB_UNC_RAW_EVENT_MASK,
+       .msr_offset     = SNB_UNC_CBO_MSR_OFFSET,
+       .ops            = &snb_uncore_msr_ops,
+       .format_group   = &snb_uncore_format_group,
+       .event_descs    = snb_uncore_events,
+};
+
+static struct intel_uncore_type snb_uncore_arb = {
+       .name           = "arb",
+       .num_counters   = 2,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 44,
+       .perf_ctr       = SNB_UNC_ARB_PER_CTR0,
+       .event_ctl      = SNB_UNC_ARB_PERFEVTSEL0,
+       .event_mask     = SNB_UNC_RAW_EVENT_MASK,
+       .msr_offset     = SNB_UNC_ARB_MSR_OFFSET,
+       .constraints    = snb_uncore_arb_constraints,
+       .ops            = &snb_uncore_msr_ops,
+       .format_group   = &snb_uncore_format_group,
+};
+
+static struct intel_uncore_type *snb_msr_uncores[] = {
+       &snb_uncore_cbox,
+       &snb_uncore_arb,
+       NULL,
+};
+
+void snb_uncore_cpu_init(void)
+{
+       uncore_msr_uncores = snb_msr_uncores;
+       if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+}
+
+enum {
+       SNB_PCI_UNCORE_IMC,
+};
+
+static struct uncore_event_desc snb_uncore_imc_events[] = {
+       INTEL_UNCORE_EVENT_DESC(data_reads,  "event=0x01"),
+       INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
+
+       INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
+       INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
+
+       { /* end: all zeroes */ },
+};
+
+#define SNB_UNCORE_PCI_IMC_EVENT_MASK          0xff
+#define SNB_UNCORE_PCI_IMC_BAR_OFFSET          0x48
+
+/* page size multiple covering all config regs */
+#define SNB_UNCORE_PCI_IMC_MAP_SIZE            0x6000
+
+#define SNB_UNCORE_PCI_IMC_DATA_READS          0x1
+#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE     0x5050
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES         0x2
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE    0x5054
+#define SNB_UNCORE_PCI_IMC_CTR_BASE            SNB_UNCORE_PCI_IMC_DATA_READS_BASE
+
+static struct attribute *snb_uncore_imc_formats_attr[] = {
+       &format_attr_event.attr,
+       NULL,
+};
+
+static struct attribute_group snb_uncore_imc_format_group = {
+       .name = "format",
+       .attrs = snb_uncore_imc_formats_attr,
+};
+
+static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
+       resource_size_t addr;
+       u32 pci_dword;
+
+       pci_read_config_dword(pdev, where, &pci_dword);
+       addr = pci_dword;
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+       pci_read_config_dword(pdev, where + 4, &pci_dword);
+       addr |= ((resource_size_t)pci_dword << 32);
+#endif
+
+       addr &= ~(PAGE_SIZE - 1);
+
+       box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+       box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
+}
+
+static void snb_uncore_imc_exit_box(struct intel_uncore_box *box)
+{
+       iounmap(box->io_addr);
+}
+
+static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
+}
+
+/*
+ * custom event_init() function because we define our own fixed, free
+ * running counters, so we do not want to conflict with generic uncore
+ * logic. Also simplifies processing
+ */
+static int snb_uncore_imc_event_init(struct perf_event *event)
+{
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       struct hw_perf_event *hwc = &event->hw;
+       u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
+       int idx, base;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       pmu = uncore_event_to_pmu(event);
+       /* no device found for this pmu */
+       if (pmu->func_id < 0)
+               return -ENOENT;
+
+       /* Sampling not supported yet */
+       if (hwc->sample_period)
+               return -EINVAL;
+
+       /* unsupported modes and filters */
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest  ||
+           event->attr.sample_period) /* no sampling */
+               return -EINVAL;
+
+       /*
+        * Place all uncore events for a particular physical package
+        * onto a single cpu
+        */
+       if (event->cpu < 0)
+               return -EINVAL;
+
+       /* check only supported bits are set */
+       if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
+               return -EINVAL;
+
+       box = uncore_pmu_to_box(pmu, event->cpu);
+       if (!box || box->cpu < 0)
+               return -EINVAL;
+
+       event->cpu = box->cpu;
+       event->pmu_private = box;
+
+       event->hw.idx = -1;
+       event->hw.last_tag = ~0ULL;
+       event->hw.extra_reg.idx = EXTRA_REG_NONE;
+       event->hw.branch_reg.idx = EXTRA_REG_NONE;
+       /*
+        * check event is known (whitelist, determines counter)
+        */
+       switch (cfg) {
+       case SNB_UNCORE_PCI_IMC_DATA_READS:
+               base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
+               idx = UNCORE_PMC_IDX_FIXED;
+               break;
+       case SNB_UNCORE_PCI_IMC_DATA_WRITES:
+               base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
+               idx = UNCORE_PMC_IDX_FIXED + 1;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       /* must be done before validate_group */
+       event->hw.event_base = base;
+       event->hw.config = cfg;
+       event->hw.idx = idx;
+
+       /* no group validation needed, we have free running counters */
+
+       return 0;
+}
+
+static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       return 0;
+}
+
+static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       u64 count;
+
+       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+               return;
+
+       event->hw.state = 0;
+       box->n_active++;
+
+       list_add_tail(&event->active_entry, &box->active_list);
+
+       count = snb_uncore_imc_read_counter(box, event);
+       local64_set(&event->hw.prev_count, count);
+
+       if (box->n_active == 1)
+               uncore_pmu_start_hrtimer(box);
+}
+
+static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (!(hwc->state & PERF_HES_STOPPED)) {
+               box->n_active--;
+
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+
+               list_del(&event->active_entry);
+
+               if (box->n_active == 0)
+                       uncore_pmu_cancel_hrtimer(box);
+       }
+
+       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               /*
+                * Drain the remaining delta count out of a event
+                * that we are disabling:
+                */
+               uncore_perf_event_update(box, event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+}
+
+static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (!box)
+               return -ENODEV;
+
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+       if (!(flags & PERF_EF_START))
+               hwc->state |= PERF_HES_ARCH;
+
+       snb_uncore_imc_event_start(event, 0);
+
+       box->n_events++;
+
+       return 0;
+}
+
+static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       int i;
+
+       snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
+
+       for (i = 0; i < box->n_events; i++) {
+               if (event == box->event_list[i]) {
+                       --box->n_events;
+                       break;
+               }
+       }
+}
+
+int snb_pci2phy_map_init(int devid)
+{
+       struct pci_dev *dev = NULL;
+       struct pci2phy_map *map;
+       int bus, segment;
+
+       dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
+       if (!dev)
+               return -ENOTTY;
+
+       bus = dev->bus->number;
+       segment = pci_domain_nr(dev->bus);
+
+       raw_spin_lock(&pci2phy_map_lock);
+       map = __find_pci2phy_map(segment);
+       if (!map) {
+               raw_spin_unlock(&pci2phy_map_lock);
+               pci_dev_put(dev);
+               return -ENOMEM;
+       }
+       map->pbus_to_physid[bus] = 0;
+       raw_spin_unlock(&pci2phy_map_lock);
+
+       pci_dev_put(dev);
+
+       return 0;
+}
+
+static struct pmu snb_uncore_imc_pmu = {
+       .task_ctx_nr    = perf_invalid_context,
+       .event_init     = snb_uncore_imc_event_init,
+       .add            = snb_uncore_imc_event_add,
+       .del            = snb_uncore_imc_event_del,
+       .start          = snb_uncore_imc_event_start,
+       .stop           = snb_uncore_imc_event_stop,
+       .read           = uncore_pmu_event_read,
+};
+
+static struct intel_uncore_ops snb_uncore_imc_ops = {
+       .init_box       = snb_uncore_imc_init_box,
+       .exit_box       = snb_uncore_imc_exit_box,
+       .enable_box     = snb_uncore_imc_enable_box,
+       .disable_box    = snb_uncore_imc_disable_box,
+       .disable_event  = snb_uncore_imc_disable_event,
+       .enable_event   = snb_uncore_imc_enable_event,
+       .hw_config      = snb_uncore_imc_hw_config,
+       .read_counter   = snb_uncore_imc_read_counter,
+};
+
+static struct intel_uncore_type snb_uncore_imc = {
+       .name           = "imc",
+       .num_counters   = 2,
+       .num_boxes      = 1,
+       .fixed_ctr_bits = 32,
+       .fixed_ctr      = SNB_UNCORE_PCI_IMC_CTR_BASE,
+       .event_descs    = snb_uncore_imc_events,
+       .format_group   = &snb_uncore_imc_format_group,
+       .perf_ctr       = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
+       .event_mask     = SNB_UNCORE_PCI_IMC_EVENT_MASK,
+       .ops            = &snb_uncore_imc_ops,
+       .pmu            = &snb_uncore_imc_pmu,
+};
+
+static struct intel_uncore_type *snb_pci_uncores[] = {
+       [SNB_PCI_UNCORE_IMC]    = &snb_uncore_imc,
+       NULL,
+};
+
+static const struct pci_device_id snb_uncore_pci_ids[] = {
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id ivb_uncore_pci_ids[] = {
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id hsw_uncore_pci_ids[] = {
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id bdw_uncore_pci_ids[] = {
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id skl_uncore_pci_ids[] = {
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* end: all zeroes */ },
+};
+
+static struct pci_driver snb_uncore_pci_driver = {
+       .name           = "snb_uncore",
+       .id_table       = snb_uncore_pci_ids,
+};
+
+static struct pci_driver ivb_uncore_pci_driver = {
+       .name           = "ivb_uncore",
+       .id_table       = ivb_uncore_pci_ids,
+};
+
+static struct pci_driver hsw_uncore_pci_driver = {
+       .name           = "hsw_uncore",
+       .id_table       = hsw_uncore_pci_ids,
+};
+
+static struct pci_driver bdw_uncore_pci_driver = {
+       .name           = "bdw_uncore",
+       .id_table       = bdw_uncore_pci_ids,
+};
+
+static struct pci_driver skl_uncore_pci_driver = {
+       .name           = "skl_uncore",
+       .id_table       = skl_uncore_pci_ids,
+};
+
+struct imc_uncore_pci_dev {
+       __u32 pci_id;
+       struct pci_driver *driver;
+};
+#define IMC_DEV(a, d) \
+       { .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) }
+
+static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
+       IMC_DEV(SNB_IMC, &snb_uncore_pci_driver),
+       IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),    /* 3rd Gen Core processor */
+       IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
+       IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),    /* 4th Gen Core Processor */
+       IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile Processor */
+       IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver),    /* 5th Gen Core U */
+       IMC_DEV(SKL_IMC, &skl_uncore_pci_driver),    /* 6th Gen Core */
+       {  /* end marker */ }
+};
+
+
+#define for_each_imc_pci_id(x, t) \
+       for (x = (t); (x)->pci_id; x++)
+
+static struct pci_driver *imc_uncore_find_dev(void)
+{
+       const struct imc_uncore_pci_dev *p;
+       int ret;
+
+       for_each_imc_pci_id(p, desktop_imc_pci_ids) {
+               ret = snb_pci2phy_map_init(p->pci_id);
+               if (ret == 0)
+                       return p->driver;
+       }
+       return NULL;
+}
+
+static int imc_uncore_pci_init(void)
+{
+       struct pci_driver *imc_drv = imc_uncore_find_dev();
+
+       if (!imc_drv)
+               return -ENODEV;
+
+       uncore_pci_uncores = snb_pci_uncores;
+       uncore_pci_driver = imc_drv;
+
+       return 0;
+}
+
+int snb_uncore_pci_init(void)
+{
+       return imc_uncore_pci_init();
+}
+
+int ivb_uncore_pci_init(void)
+{
+       return imc_uncore_pci_init();
+}
+int hsw_uncore_pci_init(void)
+{
+       return imc_uncore_pci_init();
+}
+
+int bdw_uncore_pci_init(void)
+{
+       return imc_uncore_pci_init();
+}
+
+int skl_uncore_pci_init(void)
+{
+       return imc_uncore_pci_init();
+}
+
+/* end of Sandy Bridge uncore support */
+
+/* Nehalem uncore support */
+static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+       wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+       wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
+}
+
+static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+               wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+       else
+               wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
+}
+
+static struct attribute *nhm_uncore_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask8.attr,
+       NULL,
+};
+
+static struct attribute_group nhm_uncore_format_group = {
+       .name = "format",
+       .attrs = nhm_uncore_formats_attr,
+};
+
+static struct uncore_event_desc nhm_uncore_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks,                "event=0xff,umask=0x00"),
+       INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any,       "event=0x2f,umask=0x0f"),
+       INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any,      "event=0x2c,umask=0x0f"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads,     "event=0x20,umask=0x01"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes,    "event=0x20,umask=0x02"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads,  "event=0x20,umask=0x04"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads,   "event=0x20,umask=0x10"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes,  "event=0x20,umask=0x20"),
+       { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhm_uncore_msr_ops = {
+       .disable_box    = nhm_uncore_msr_disable_box,
+       .enable_box     = nhm_uncore_msr_enable_box,
+       .disable_event  = snb_uncore_msr_disable_event,
+       .enable_event   = nhm_uncore_msr_enable_event,
+       .read_counter   = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type nhm_uncore = {
+       .name           = "",
+       .num_counters   = 8,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       .fixed_ctr_bits = 48,
+       .event_ctl      = NHM_UNC_PERFEVTSEL0,
+       .perf_ctr       = NHM_UNC_UNCORE_PMC0,
+       .fixed_ctr      = NHM_UNC_FIXED_CTR,
+       .fixed_ctl      = NHM_UNC_FIXED_CTR_CTRL,
+       .event_mask     = NHM_UNC_RAW_EVENT_MASK,
+       .event_descs    = nhm_uncore_events,
+       .ops            = &nhm_uncore_msr_ops,
+       .format_group   = &nhm_uncore_format_group,
+};
+
+static struct intel_uncore_type *nhm_msr_uncores[] = {
+       &nhm_uncore,
+       NULL,
+};
+
+void nhm_uncore_cpu_init(void)
+{
+       uncore_msr_uncores = nhm_msr_uncores;
+}
+
+/* end of Nehalem uncore support */
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c

new file mode 100644 (file)

index 0000000..93f6bd9
--- /dev/null
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -0,0 +1,3135 @@
+/* SandyBridge-EP/IvyTown uncore support */
+#include "uncore.h"
+
+/* SNB-EP Box level control */
+#define SNBEP_PMON_BOX_CTL_RST_CTRL    (1 << 0)
+#define SNBEP_PMON_BOX_CTL_RST_CTRS    (1 << 1)
+#define SNBEP_PMON_BOX_CTL_FRZ         (1 << 8)
+#define SNBEP_PMON_BOX_CTL_FRZ_EN      (1 << 16)
+#define SNBEP_PMON_BOX_CTL_INT         (SNBEP_PMON_BOX_CTL_RST_CTRL | \
+                                        SNBEP_PMON_BOX_CTL_RST_CTRS | \
+                                        SNBEP_PMON_BOX_CTL_FRZ_EN)
+/* SNB-EP event control */
+#define SNBEP_PMON_CTL_EV_SEL_MASK     0x000000ff
+#define SNBEP_PMON_CTL_UMASK_MASK      0x0000ff00
+#define SNBEP_PMON_CTL_RST             (1 << 17)
+#define SNBEP_PMON_CTL_EDGE_DET                (1 << 18)
+#define SNBEP_PMON_CTL_EV_SEL_EXT      (1 << 21)
+#define SNBEP_PMON_CTL_EN              (1 << 22)
+#define SNBEP_PMON_CTL_INVERT          (1 << 23)
+#define SNBEP_PMON_CTL_TRESH_MASK      0xff000000
+#define SNBEP_PMON_RAW_EVENT_MASK      (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                        SNBEP_PMON_CTL_UMASK_MASK | \
+                                        SNBEP_PMON_CTL_EDGE_DET | \
+                                        SNBEP_PMON_CTL_INVERT | \
+                                        SNBEP_PMON_CTL_TRESH_MASK)
+
+/* SNB-EP Ubox event control */
+#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK                0x1f000000
+#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK                \
+                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                SNBEP_PMON_CTL_UMASK_MASK | \
+                                SNBEP_PMON_CTL_EDGE_DET | \
+                                SNBEP_PMON_CTL_INVERT | \
+                                SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+
+#define SNBEP_CBO_PMON_CTL_TID_EN              (1 << 19)
+#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK      (SNBEP_PMON_RAW_EVENT_MASK | \
+                                                SNBEP_CBO_PMON_CTL_TID_EN)
+
+/* SNB-EP PCU event control */
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK    0x0000c000
+#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK      0x1f000000
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT      (1 << 30)
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET    (1 << 31)
+#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK      \
+                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+                                SNBEP_PMON_CTL_EDGE_DET | \
+                                SNBEP_PMON_CTL_EV_SEL_EXT | \
+                                SNBEP_PMON_CTL_INVERT | \
+                                SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+
+#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK      \
+                               (SNBEP_PMON_RAW_EVENT_MASK | \
+                                SNBEP_PMON_CTL_EV_SEL_EXT)
+
+/* SNB-EP pci control register */
+#define SNBEP_PCI_PMON_BOX_CTL                 0xf4
+#define SNBEP_PCI_PMON_CTL0                    0xd8
+/* SNB-EP pci counter register */
+#define SNBEP_PCI_PMON_CTR0                    0xa0
+
+/* SNB-EP home agent register */
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0       0x40
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1       0x44
+#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH      0x48
+/* SNB-EP memory controller register */
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL                0xf0
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR                0xd0
+/* SNB-EP QPI register */
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0         0x228
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1         0x22c
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0          0x238
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1          0x23c
+
+/* SNB-EP Ubox register */
+#define SNBEP_U_MSR_PMON_CTR0                  0xc16
+#define SNBEP_U_MSR_PMON_CTL0                  0xc10
+
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL                0xc08
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR                0xc09
+
+/* SNB-EP Cbo register */
+#define SNBEP_C0_MSR_PMON_CTR0                 0xd16
+#define SNBEP_C0_MSR_PMON_CTL0                 0xd10
+#define SNBEP_C0_MSR_PMON_BOX_CTL              0xd04
+#define SNBEP_C0_MSR_PMON_BOX_FILTER           0xd14
+#define SNBEP_CBO_MSR_OFFSET                   0x20
+
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID      0x1f
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID      0x3fc00
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE    0x7c0000
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC      0xff800000
+
+#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) {   \
+       .event = (e),                           \
+       .msr = SNBEP_C0_MSR_PMON_BOX_FILTER,    \
+       .config_mask = (m),                     \
+       .idx = (i)                              \
+}
+
+/* SNB-EP PCU register */
+#define SNBEP_PCU_MSR_PMON_CTR0                        0xc36
+#define SNBEP_PCU_MSR_PMON_CTL0                        0xc30
+#define SNBEP_PCU_MSR_PMON_BOX_CTL             0xc24
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER          0xc34
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK     0xffffffff
+#define SNBEP_PCU_MSR_CORE_C3_CTR              0x3fc
+#define SNBEP_PCU_MSR_CORE_C6_CTR              0x3fd
+
+/* IVBEP event control */
+#define IVBEP_PMON_BOX_CTL_INT         (SNBEP_PMON_BOX_CTL_RST_CTRL | \
+                                        SNBEP_PMON_BOX_CTL_RST_CTRS)
+#define IVBEP_PMON_RAW_EVENT_MASK              (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                        SNBEP_PMON_CTL_UMASK_MASK | \
+                                        SNBEP_PMON_CTL_EDGE_DET | \
+                                        SNBEP_PMON_CTL_TRESH_MASK)
+/* IVBEP Ubox */
+#define IVBEP_U_MSR_PMON_GLOBAL_CTL            0xc00
+#define IVBEP_U_PMON_GLOBAL_FRZ_ALL            (1 << 31)
+#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL          (1 << 29)
+
+#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK        \
+                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                SNBEP_PMON_CTL_UMASK_MASK | \
+                                SNBEP_PMON_CTL_EDGE_DET | \
+                                SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+/* IVBEP Cbo */
+#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK              (IVBEP_PMON_RAW_EVENT_MASK | \
+                                                SNBEP_CBO_PMON_CTL_TID_EN)
+
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID              (0x1fULL << 0)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK     (0xfULL << 5)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE    (0x3fULL << 17)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID              (0xffffULL << 32)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC              (0x1ffULL << 52)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6               (0x1ULL << 61)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC               (0x1ULL << 62)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC     (0x1ULL << 63)
+
+/* IVBEP home agent */
+#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST                (1 << 16)
+#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK               \
+                               (IVBEP_PMON_RAW_EVENT_MASK | \
+                                IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST)
+/* IVBEP PCU */
+#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK      \
+                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                SNBEP_PMON_CTL_EV_SEL_EXT | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+                                SNBEP_PMON_CTL_EDGE_DET | \
+                                SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+/* IVBEP QPI */
+#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK      \
+                               (IVBEP_PMON_RAW_EVENT_MASK | \
+                                SNBEP_PMON_CTL_EV_SEL_EXT)
+
+#define __BITS_VALUE(x, i, n)  ((typeof(x))(((x) >> ((i) * (n))) & \
+                               ((1ULL << (n)) - 1)))
+
+/* Haswell-EP Ubox */
+#define HSWEP_U_MSR_PMON_CTR0                  0x709
+#define HSWEP_U_MSR_PMON_CTL0                  0x705
+#define HSWEP_U_MSR_PMON_FILTER                        0x707
+
+#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL                0x703
+#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTR                0x704
+
+#define HSWEP_U_MSR_PMON_BOX_FILTER_TID                (0x1 << 0)
+#define HSWEP_U_MSR_PMON_BOX_FILTER_CID                (0x1fULL << 1)
+#define HSWEP_U_MSR_PMON_BOX_FILTER_MASK \
+                                       (HSWEP_U_MSR_PMON_BOX_FILTER_TID | \
+                                        HSWEP_U_MSR_PMON_BOX_FILTER_CID)
+
+/* Haswell-EP CBo */
+#define HSWEP_C0_MSR_PMON_CTR0                 0xe08
+#define HSWEP_C0_MSR_PMON_CTL0                 0xe01
+#define HSWEP_C0_MSR_PMON_BOX_CTL                      0xe00
+#define HSWEP_C0_MSR_PMON_BOX_FILTER0          0xe05
+#define HSWEP_CBO_MSR_OFFSET                   0x10
+
+
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_TID              (0x3fULL << 0)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK     (0xfULL << 6)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE    (0x7fULL << 17)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NID              (0xffffULL << 32)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC              (0x1ffULL << 52)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_C6               (0x1ULL << 61)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NC               (0x1ULL << 62)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC     (0x1ULL << 63)
+
+
+/* Haswell-EP Sbox */
+#define HSWEP_S0_MSR_PMON_CTR0                 0x726
+#define HSWEP_S0_MSR_PMON_CTL0                 0x721
+#define HSWEP_S0_MSR_PMON_BOX_CTL                      0x720
+#define HSWEP_SBOX_MSR_OFFSET                  0xa
+#define HSWEP_S_MSR_PMON_RAW_EVENT_MASK                (SNBEP_PMON_RAW_EVENT_MASK | \
+                                                SNBEP_CBO_PMON_CTL_TID_EN)
+
+/* Haswell-EP PCU */
+#define HSWEP_PCU_MSR_PMON_CTR0                        0x717
+#define HSWEP_PCU_MSR_PMON_CTL0                        0x711
+#define HSWEP_PCU_MSR_PMON_BOX_CTL             0x710
+#define HSWEP_PCU_MSR_PMON_BOX_FILTER          0x715
+
+/* KNL Ubox */
+#define KNL_U_MSR_PMON_RAW_EVENT_MASK \
+                                       (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \
+                                               SNBEP_CBO_PMON_CTL_TID_EN)
+/* KNL CHA */
+#define KNL_CHA_MSR_OFFSET                     0xc
+#define KNL_CHA_MSR_PMON_CTL_QOR               (1 << 16)
+#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \
+                                       (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \
+                                        KNL_CHA_MSR_PMON_CTL_QOR)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_TID                0x1ff
+#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE      (7 << 18)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_OP         (0xfffffe2aULL << 32)
+
+/* KNL EDC/MC UCLK */
+#define KNL_UCLK_MSR_PMON_CTR0_LOW             0x400
+#define KNL_UCLK_MSR_PMON_CTL0                 0x420
+#define KNL_UCLK_MSR_PMON_BOX_CTL              0x430
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW       0x44c
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL       0x454
+#define KNL_PMON_FIXED_CTL_EN                  0x1
+
+/* KNL EDC */
+#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW                0xa00
+#define KNL_EDC0_ECLK_MSR_PMON_CTL0            0xa20
+#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL         0xa30
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW  0xa3c
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL  0xa44
+
+/* KNL MC */
+#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW          0xb00
+#define KNL_MC0_CH0_MSR_PMON_CTL0              0xb20
+#define KNL_MC0_CH0_MSR_PMON_BOX_CTL           0xb30
+#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW         0xb3c
+#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL         0xb44
+
+/* KNL IRP */
+#define KNL_IRP_PCI_PMON_BOX_CTL               0xf0
+#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK                (SNBEP_PMON_RAW_EVENT_MASK | \
+                                                KNL_CHA_MSR_PMON_CTL_QOR)
+/* KNL PCU */
+#define KNL_PCU_PMON_CTL_EV_SEL_MASK           0x0000007f
+#define KNL_PCU_PMON_CTL_USE_OCC_CTR           (1 << 7)
+#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK                0x3f000000
+#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK        \
+                               (KNL_PCU_PMON_CTL_EV_SEL_MASK | \
+                                KNL_PCU_PMON_CTL_USE_OCC_CTR | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+                                SNBEP_PMON_CTL_EDGE_DET | \
+                                SNBEP_CBO_PMON_CTL_TID_EN | \
+                                SNBEP_PMON_CTL_EV_SEL_EXT | \
+                                SNBEP_PMON_CTL_INVERT | \
+                                KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
+DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
+DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29");
+DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
+DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20");
+DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
+DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
+DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
+DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
+DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
+
+static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       int box_ctl = uncore_pci_box_ctl(box);
+       u32 config = 0;
+
+       if (!pci_read_config_dword(pdev, box_ctl, &config)) {
+               config |= SNBEP_PMON_BOX_CTL_FRZ;
+               pci_write_config_dword(pdev, box_ctl, config);
+       }
+}
+
+static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       int box_ctl = uncore_pci_box_ctl(box);
+       u32 config = 0;
+
+       if (!pci_read_config_dword(pdev, box_ctl, &config)) {
+               config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+               pci_write_config_dword(pdev, box_ctl, config);
+       }
+}
+
+static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+
+       pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+
+       pci_write_config_dword(pdev, hwc->config_base, hwc->config);
+}
+
+static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+       u64 count = 0;
+
+       pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
+       pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
+
+       return count;
+}
+
+static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       int box_ctl = uncore_pci_box_ctl(box);
+
+       pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+       u64 config;
+       unsigned msr;
+
+       msr = uncore_msr_box_ctl(box);
+       if (msr) {
+               rdmsrl(msr, config);
+               config |= SNBEP_PMON_BOX_CTL_FRZ;
+               wrmsrl(msr, config);
+       }
+}
+
+static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+       u64 config;
+       unsigned msr;
+
+       msr = uncore_msr_box_ctl(box);
+       if (msr) {
+               rdmsrl(msr, config);
+               config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+               wrmsrl(msr, config);
+       }
+}
+
+static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE)
+               wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0));
+
+       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
+                                       struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       wrmsrl(hwc->config_base, hwc->config);
+}
+
+static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+       unsigned msr = uncore_msr_box_ctl(box);
+
+       if (msr)
+               wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static struct attribute *snbep_uncore_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute *snbep_uncore_ubox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh5.attr,
+       NULL,
+};
+
+static struct attribute *snbep_uncore_cbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_filter_tid.attr,
+       &format_attr_filter_nid.attr,
+       &format_attr_filter_state.attr,
+       &format_attr_filter_opc.attr,
+       NULL,
+};
+
+static struct attribute *snbep_uncore_pcu_formats_attr[] = {
+       &format_attr_event_ext.attr,
+       &format_attr_occ_sel.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh5.attr,
+       &format_attr_occ_invert.attr,
+       &format_attr_occ_edge.attr,
+       &format_attr_filter_band0.attr,
+       &format_attr_filter_band1.attr,
+       &format_attr_filter_band2.attr,
+       &format_attr_filter_band3.attr,
+       NULL,
+};
+
+static struct attribute *snbep_uncore_qpi_formats_attr[] = {
+       &format_attr_event_ext.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_match_rds.attr,
+       &format_attr_match_rnid30.attr,
+       &format_attr_match_rnid4.attr,
+       &format_attr_match_dnid.attr,
+       &format_attr_match_mc.attr,
+       &format_attr_match_opc.attr,
+       &format_attr_match_vnw.attr,
+       &format_attr_match0.attr,
+       &format_attr_match1.attr,
+       &format_attr_mask_rds.attr,
+       &format_attr_mask_rnid30.attr,
+       &format_attr_mask_rnid4.attr,
+       &format_attr_mask_dnid.attr,
+       &format_attr_mask_mc.attr,
+       &format_attr_mask_opc.attr,
+       &format_attr_mask_vnw.attr,
+       &format_attr_mask0.attr,
+       &format_attr_mask1.attr,
+       NULL,
+};
+
+static struct uncore_event_desc snbep_uncore_imc_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks,      "event=0xff,umask=0x00"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read,  "event=0x04,umask=0x03"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
+       { /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc snbep_uncore_qpi_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks,       "event=0x14"),
+       INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
+       INTEL_UNCORE_EVENT_DESC(drs_data,         "event=0x102,umask=0x08"),
+       INTEL_UNCORE_EVENT_DESC(ncb_data,         "event=0x103,umask=0x04"),
+       { /* end: all zeroes */ },
+};
+
+static struct attribute_group snbep_uncore_format_group = {
+       .name = "format",
+       .attrs = snbep_uncore_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_ubox_format_group = {
+       .name = "format",
+       .attrs = snbep_uncore_ubox_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_cbox_format_group = {
+       .name = "format",
+       .attrs = snbep_uncore_cbox_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_pcu_format_group = {
+       .name = "format",
+       .attrs = snbep_uncore_pcu_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_qpi_format_group = {
+       .name = "format",
+       .attrs = snbep_uncore_qpi_formats_attr,
+};
+
+#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT()                   \
+       .disable_box    = snbep_uncore_msr_disable_box,         \
+       .enable_box     = snbep_uncore_msr_enable_box,          \
+       .disable_event  = snbep_uncore_msr_disable_event,       \
+       .enable_event   = snbep_uncore_msr_enable_event,        \
+       .read_counter   = uncore_msr_read_counter
+
+#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT()                     \
+       __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),                   \
+       .init_box       = snbep_uncore_msr_init_box             \
+
+static struct intel_uncore_ops snbep_uncore_msr_ops = {
+       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT()                     \
+       .init_box       = snbep_uncore_pci_init_box,            \
+       .disable_box    = snbep_uncore_pci_disable_box,         \
+       .enable_box     = snbep_uncore_pci_enable_box,          \
+       .disable_event  = snbep_uncore_pci_disable_event,       \
+       .read_counter   = snbep_uncore_pci_read_counter
+
+static struct intel_uncore_ops snbep_uncore_pci_ops = {
+       SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+       .enable_event   = snbep_uncore_pci_enable_event,        \
+};
+
+static struct event_constraint snbep_uncore_cbox_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
+       UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
+       UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
+       UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
+       EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
+       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2a, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snbep_uncore_ubox = {
+       .name           = "ubox",
+       .num_counters   = 2,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 44,
+       .fixed_ctr_bits = 48,
+       .perf_ctr       = SNBEP_U_MSR_PMON_CTR0,
+       .event_ctl      = SNBEP_U_MSR_PMON_CTL0,
+       .event_mask     = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+       .fixed_ctr      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+       .fixed_ctl      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+       .ops            = &snbep_uncore_msr_ops,
+       .format_group   = &snbep_uncore_ubox_format_group,
+};
+
+static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
+       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2),
+       EVENT_EXTRA_END
+};
+
+static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+       int i;
+
+       if (uncore_box_is_fake(box))
+               return;
+
+       for (i = 0; i < 5; i++) {
+               if (reg1->alloc & (0x1 << i))
+                       atomic_sub(1 << (i * 6), &er->ref);
+       }
+       reg1->alloc = 0;
+}
+
+static struct event_constraint *
+__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event,
+                           u64 (*cbox_filter_mask)(int fields))
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+       int i, alloc = 0;
+       unsigned long flags;
+       u64 mask;
+
+       if (reg1->idx == EXTRA_REG_NONE)
+               return NULL;
+
+       raw_spin_lock_irqsave(&er->lock, flags);
+       for (i = 0; i < 5; i++) {
+               if (!(reg1->idx & (0x1 << i)))
+                       continue;
+               if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+                       continue;
+
+               mask = cbox_filter_mask(0x1 << i);
+               if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) ||
+                   !((reg1->config ^ er->config) & mask)) {
+                       atomic_add(1 << (i * 6), &er->ref);
+                       er->config &= ~mask;
+                       er->config |= reg1->config & mask;
+                       alloc |= (0x1 << i);
+               } else {
+                       break;
+               }
+       }
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+       if (i < 5)
+               goto fail;
+
+       if (!uncore_box_is_fake(box))
+               reg1->alloc |= alloc;
+
+       return NULL;
+fail:
+       for (; i >= 0; i--) {
+               if (alloc & (0x1 << i))
+                       atomic_sub(1 << (i * 6), &er->ref);
+       }
+       return &uncore_constraint_empty;
+}
+
+static u64 snbep_cbox_filter_mask(int fields)
+{
+       u64 mask = 0;
+
+       if (fields & 0x1)
+               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID;
+       if (fields & 0x2)
+               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID;
+       if (fields & 0x4)
+               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+       if (fields & 0x8)
+               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+
+       return mask;
+}
+
+static struct event_constraint *
+snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask);
+}
+
+static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct extra_reg *er;
+       int idx = 0;
+
+       for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) {
+               if (er->event != (event->hw.config & er->config_mask))
+                       continue;
+               idx |= er->idx;
+       }
+
+       if (idx) {
+               reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+                       SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+               reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx);
+               reg1->idx = idx;
+       }
+       return 0;
+}
+
+static struct intel_uncore_ops snbep_uncore_cbox_ops = {
+       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .hw_config              = snbep_cbox_hw_config,
+       .get_constraint         = snbep_cbox_get_constraint,
+       .put_constraint         = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_cbox = {
+       .name                   = "cbox",
+       .num_counters           = 4,
+       .num_boxes              = 8,
+       .perf_ctr_bits          = 44,
+       .event_ctl              = SNBEP_C0_MSR_PMON_CTL0,
+       .perf_ctr               = SNBEP_C0_MSR_PMON_CTR0,
+       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_C0_MSR_PMON_BOX_CTL,
+       .msr_offset             = SNBEP_CBO_MSR_OFFSET,
+       .num_shared_regs        = 1,
+       .constraints            = snbep_uncore_cbox_constraints,
+       .ops                    = &snbep_uncore_cbox_ops,
+       .format_group           = &snbep_uncore_cbox_format_group,
+};
+
+static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       u64 config = reg1->config;
+
+       if (new_idx > reg1->idx)
+               config <<= 8 * (new_idx - reg1->idx);
+       else
+               config >>= 8 * (reg1->idx - new_idx);
+
+       if (modify) {
+               hwc->config += new_idx - reg1->idx;
+               reg1->config = config;
+               reg1->idx = new_idx;
+       }
+       return config;
+}
+
+static struct event_constraint *
+snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+       unsigned long flags;
+       int idx = reg1->idx;
+       u64 mask, config1 = reg1->config;
+       bool ok = false;
+
+       if (reg1->idx == EXTRA_REG_NONE ||
+           (!uncore_box_is_fake(box) && reg1->alloc))
+               return NULL;
+again:
+       mask = 0xffULL << (idx * 8);
+       raw_spin_lock_irqsave(&er->lock, flags);
+       if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) ||
+           !((config1 ^ er->config) & mask)) {
+               atomic_add(1 << (idx * 8), &er->ref);
+               er->config &= ~mask;
+               er->config |= config1 & mask;
+               ok = true;
+       }
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+
+       if (!ok) {
+               idx = (idx + 1) % 4;
+               if (idx != reg1->idx) {
+                       config1 = snbep_pcu_alter_er(event, idx, false);
+                       goto again;
+               }
+               return &uncore_constraint_empty;
+       }
+
+       if (!uncore_box_is_fake(box)) {
+               if (idx != reg1->idx)
+                       snbep_pcu_alter_er(event, idx, true);
+               reg1->alloc = 1;
+       }
+       return NULL;
+}
+
+static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+
+       if (uncore_box_is_fake(box) || !reg1->alloc)
+               return;
+
+       atomic_sub(1 << (reg1->idx * 8), &er->ref);
+       reg1->alloc = 0;
+}
+
+static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
+
+       if (ev_sel >= 0xb && ev_sel <= 0xe) {
+               reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
+               reg1->idx = ev_sel - 0xb;
+               reg1->config = event->attr.config1 & (0xff << (reg1->idx * 8));
+       }
+       return 0;
+}
+
+static struct intel_uncore_ops snbep_uncore_pcu_ops = {
+       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .hw_config              = snbep_pcu_hw_config,
+       .get_constraint         = snbep_pcu_get_constraint,
+       .put_constraint         = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_pcu = {
+       .name                   = "pcu",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCU_MSR_PMON_CTR0,
+       .event_ctl              = SNBEP_PCU_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCU_MSR_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &snbep_uncore_pcu_ops,
+       .format_group           = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *snbep_msr_uncores[] = {
+       &snbep_uncore_ubox,
+       &snbep_uncore_cbox,
+       &snbep_uncore_pcu,
+       NULL,
+};
+
+void snbep_uncore_cpu_init(void)
+{
+       if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+       uncore_msr_uncores = snbep_msr_uncores;
+}
+
+enum {
+       SNBEP_PCI_QPI_PORT0_FILTER,
+       SNBEP_PCI_QPI_PORT1_FILTER,
+       HSWEP_PCI_PCU_3,
+};
+
+static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+       if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
+               reg1->idx = 0;
+               reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
+               reg1->config = event->attr.config1;
+               reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
+               reg2->config = event->attr.config2;
+       }
+       return 0;
+}
+
+static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE) {
+               int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
+               int pkg = topology_phys_to_logical_pkg(box->pci_phys_id);
+               struct pci_dev *filter_pdev = uncore_extra_pci_dev[pkg].dev[idx];
+
+               if (filter_pdev) {
+                       pci_write_config_dword(filter_pdev, reg1->reg,
+                                               (u32)reg1->config);
+                       pci_write_config_dword(filter_pdev, reg1->reg + 4,
+                                               (u32)(reg1->config >> 32));
+                       pci_write_config_dword(filter_pdev, reg2->reg,
+                                               (u32)reg2->config);
+                       pci_write_config_dword(filter_pdev, reg2->reg + 4,
+                                               (u32)(reg2->config >> 32));
+               }
+       }
+
+       pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops snbep_uncore_qpi_ops = {
+       SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+       .enable_event           = snbep_qpi_enable_event,
+       .hw_config              = snbep_qpi_hw_config,
+       .get_constraint         = uncore_get_constraint,
+       .put_constraint         = uncore_put_constraint,
+};
+
+#define SNBEP_UNCORE_PCI_COMMON_INIT()                         \
+       .perf_ctr       = SNBEP_PCI_PMON_CTR0,                  \
+       .event_ctl      = SNBEP_PCI_PMON_CTL0,                  \
+       .event_mask     = SNBEP_PMON_RAW_EVENT_MASK,            \
+       .box_ctl        = SNBEP_PCI_PMON_BOX_CTL,               \
+       .ops            = &snbep_uncore_pci_ops,                \
+       .format_group   = &snbep_uncore_format_group
+
+static struct intel_uncore_type snbep_uncore_ha = {
+       .name           = "ha",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_imc = {
+       .name           = "imc",
+       .num_counters   = 4,
+       .num_boxes      = 4,
+       .perf_ctr_bits  = 48,
+       .fixed_ctr_bits = 48,
+       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+       .event_descs    = snbep_uncore_imc_events,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_qpi = {
+       .name                   = "qpi",
+       .num_counters           = 4,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
+       .event_ctl              = SNBEP_PCI_PMON_CTL0,
+       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &snbep_uncore_qpi_ops,
+       .event_descs            = snbep_uncore_qpi_events,
+       .format_group           = &snbep_uncore_qpi_format_group,
+};
+
+
+static struct intel_uncore_type snbep_uncore_r2pcie = {
+       .name           = "r2pcie",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 44,
+       .constraints    = snbep_uncore_r2pcie_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_r3qpi = {
+       .name           = "r3qpi",
+       .num_counters   = 3,
+       .num_boxes      = 2,
+       .perf_ctr_bits  = 44,
+       .constraints    = snbep_uncore_r3qpi_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+       SNBEP_PCI_UNCORE_HA,
+       SNBEP_PCI_UNCORE_IMC,
+       SNBEP_PCI_UNCORE_QPI,
+       SNBEP_PCI_UNCORE_R2PCIE,
+       SNBEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *snbep_pci_uncores[] = {
+       [SNBEP_PCI_UNCORE_HA]           = &snbep_uncore_ha,
+       [SNBEP_PCI_UNCORE_IMC]          = &snbep_uncore_imc,
+       [SNBEP_PCI_UNCORE_QPI]          = &snbep_uncore_qpi,
+       [SNBEP_PCI_UNCORE_R2PCIE]       = &snbep_uncore_r2pcie,
+       [SNBEP_PCI_UNCORE_R3QPI]        = &snbep_uncore_r3qpi,
+       NULL,
+};
+
+static const struct pci_device_id snbep_uncore_pci_ids[] = {
+       { /* Home Agent */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
+       },
+       { /* MC Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
+       },
+       { /* MC Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
+       },
+       { /* MC Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
+       },
+       { /* MC Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
+       },
+       { /* QPI Port 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
+       },
+       { /* QPI Port 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
+       },
+       { /* R2PCIe */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
+       },
+       { /* R3QPI Link 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
+       },
+       { /* R3QPI Link 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
+       },
+       { /* QPI Port 0 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT0_FILTER),
+       },
+       { /* QPI Port 0 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT1_FILTER),
+       },
+       { /* end: all zeroes */ }
+};
+
+static struct pci_driver snbep_uncore_pci_driver = {
+       .name           = "snbep_uncore",
+       .id_table       = snbep_uncore_pci_ids,
+};
+
+/*
+ * build pci bus to socket mapping
+ */
+static int snbep_pci2phy_map_init(int devid)
+{
+       struct pci_dev *ubox_dev = NULL;
+       int i, bus, nodeid, segment;
+       struct pci2phy_map *map;
+       int err = 0;
+       u32 config = 0;
+
+       while (1) {
+               /* find the UBOX device */
+               ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev);
+               if (!ubox_dev)
+                       break;
+               bus = ubox_dev->bus->number;
+               /* get the Node ID of the local register */
+               err = pci_read_config_dword(ubox_dev, 0x40, &config);
+               if (err)
+                       break;
+               nodeid = config;
+               /* get the Node ID mapping */
+               err = pci_read_config_dword(ubox_dev, 0x54, &config);
+               if (err)
+                       break;
+
+               segment = pci_domain_nr(ubox_dev->bus);
+               raw_spin_lock(&pci2phy_map_lock);
+               map = __find_pci2phy_map(segment);
+               if (!map) {
+                       raw_spin_unlock(&pci2phy_map_lock);
+                       err = -ENOMEM;
+                       break;
+               }
+
+               /*
+                * every three bits in the Node ID mapping register maps
+                * to a particular node.
+                */
+               for (i = 0; i < 8; i++) {
+                       if (nodeid == ((config >> (3 * i)) & 0x7)) {
+                               map->pbus_to_physid[bus] = i;
+                               break;
+                       }
+               }
+               raw_spin_unlock(&pci2phy_map_lock);
+       }
+
+       if (!err) {
+               /*
+                * For PCI bus with no UBOX device, find the next bus
+                * that has UBOX device and use its mapping.
+                */
+               raw_spin_lock(&pci2phy_map_lock);
+               list_for_each_entry(map, &pci2phy_map_head, list) {
+                       i = -1;
+                       for (bus = 255; bus >= 0; bus--) {
+                               if (map->pbus_to_physid[bus] >= 0)
+                                       i = map->pbus_to_physid[bus];
+                               else
+                                       map->pbus_to_physid[bus] = i;
+                       }
+               }
+               raw_spin_unlock(&pci2phy_map_lock);
+       }
+
+       pci_dev_put(ubox_dev);
+
+       return err ? pcibios_err_to_errno(err) : 0;
+}
+
+int snbep_uncore_pci_init(void)
+{
+       int ret = snbep_pci2phy_map_init(0x3ce0);
+       if (ret)
+               return ret;
+       uncore_pci_uncores = snbep_pci_uncores;
+       uncore_pci_driver = &snbep_uncore_pci_driver;
+       return 0;
+}
+/* end of Sandy Bridge-EP uncore support */
+
+/* IvyTown uncore support */
+static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+       unsigned msr = uncore_msr_box_ctl(box);
+       if (msr)
+               wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+
+       pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT()                     \
+       .init_box       = ivbep_uncore_msr_init_box,            \
+       .disable_box    = snbep_uncore_msr_disable_box,         \
+       .enable_box     = snbep_uncore_msr_enable_box,          \
+       .disable_event  = snbep_uncore_msr_disable_event,       \
+       .enable_event   = snbep_uncore_msr_enable_event,        \
+       .read_counter   = uncore_msr_read_counter
+
+static struct intel_uncore_ops ivbep_uncore_msr_ops = {
+       IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+static struct intel_uncore_ops ivbep_uncore_pci_ops = {
+       .init_box       = ivbep_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = snbep_uncore_pci_enable_box,
+       .disable_event  = snbep_uncore_pci_disable_event,
+       .enable_event   = snbep_uncore_pci_enable_event,
+       .read_counter   = snbep_uncore_pci_read_counter,
+};
+
+#define IVBEP_UNCORE_PCI_COMMON_INIT()                         \
+       .perf_ctr       = SNBEP_PCI_PMON_CTR0,                  \
+       .event_ctl      = SNBEP_PCI_PMON_CTL0,                  \
+       .event_mask     = IVBEP_PMON_RAW_EVENT_MASK,            \
+       .box_ctl        = SNBEP_PCI_PMON_BOX_CTL,               \
+       .ops            = &ivbep_uncore_pci_ops,                        \
+       .format_group   = &ivbep_uncore_format_group
+
+static struct attribute *ivbep_uncore_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute *ivbep_uncore_ubox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh5.attr,
+       NULL,
+};
+
+static struct attribute *ivbep_uncore_cbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_filter_tid.attr,
+       &format_attr_filter_link.attr,
+       &format_attr_filter_state2.attr,
+       &format_attr_filter_nid2.attr,
+       &format_attr_filter_opc2.attr,
+       &format_attr_filter_nc.attr,
+       &format_attr_filter_c6.attr,
+       &format_attr_filter_isoc.attr,
+       NULL,
+};
+
+static struct attribute *ivbep_uncore_pcu_formats_attr[] = {
+       &format_attr_event_ext.attr,
+       &format_attr_occ_sel.attr,
+       &format_attr_edge.attr,
+       &format_attr_thresh5.attr,
+       &format_attr_occ_invert.attr,
+       &format_attr_occ_edge.attr,
+       &format_attr_filter_band0.attr,
+       &format_attr_filter_band1.attr,
+       &format_attr_filter_band2.attr,
+       &format_attr_filter_band3.attr,
+       NULL,
+};
+
+static struct attribute *ivbep_uncore_qpi_formats_attr[] = {
+       &format_attr_event_ext.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_match_rds.attr,
+       &format_attr_match_rnid30.attr,
+       &format_attr_match_rnid4.attr,
+       &format_attr_match_dnid.attr,
+       &format_attr_match_mc.attr,
+       &format_attr_match_opc.attr,
+       &format_attr_match_vnw.attr,
+       &format_attr_match0.attr,
+       &format_attr_match1.attr,
+       &format_attr_mask_rds.attr,
+       &format_attr_mask_rnid30.attr,
+       &format_attr_mask_rnid4.attr,
+       &format_attr_mask_dnid.attr,
+       &format_attr_mask_mc.attr,
+       &format_attr_mask_opc.attr,
+       &format_attr_mask_vnw.attr,
+       &format_attr_mask0.attr,
+       &format_attr_mask1.attr,
+       NULL,
+};
+
+static struct attribute_group ivbep_uncore_format_group = {
+       .name = "format",
+       .attrs = ivbep_uncore_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_ubox_format_group = {
+       .name = "format",
+       .attrs = ivbep_uncore_ubox_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_cbox_format_group = {
+       .name = "format",
+       .attrs = ivbep_uncore_cbox_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_pcu_format_group = {
+       .name = "format",
+       .attrs = ivbep_uncore_pcu_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_qpi_format_group = {
+       .name = "format",
+       .attrs = ivbep_uncore_qpi_formats_attr,
+};
+
+static struct intel_uncore_type ivbep_uncore_ubox = {
+       .name           = "ubox",
+       .num_counters   = 2,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 44,
+       .fixed_ctr_bits = 48,
+       .perf_ctr       = SNBEP_U_MSR_PMON_CTR0,
+       .event_ctl      = SNBEP_U_MSR_PMON_CTL0,
+       .event_mask     = IVBEP_U_MSR_PMON_RAW_EVENT_MASK,
+       .fixed_ctr      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+       .fixed_ctl      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+       .ops            = &ivbep_uncore_msr_ops,
+       .format_group   = &ivbep_uncore_ubox_format_group,
+};
+
+static struct extra_reg ivbep_uncore_cbox_extra_regs[] = {
+       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
+       EVENT_EXTRA_END
+};
+
+static u64 ivbep_cbox_filter_mask(int fields)
+{
+       u64 mask = 0;
+
+       if (fields & 0x1)
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_TID;
+       if (fields & 0x2)
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK;
+       if (fields & 0x4)
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+       if (fields & 0x8)
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NID;
+       if (fields & 0x10) {
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NC;
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_C6;
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
+       }
+
+       return mask;
+}
+
+static struct event_constraint *
+ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask);
+}
+
+static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct extra_reg *er;
+       int idx = 0;
+
+       for (er = ivbep_uncore_cbox_extra_regs; er->msr; er++) {
+               if (er->event != (event->hw.config & er->config_mask))
+                       continue;
+               idx |= er->idx;
+       }
+
+       if (idx) {
+               reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+                       SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+               reg1->config = event->attr.config1 & ivbep_cbox_filter_mask(idx);
+               reg1->idx = idx;
+       }
+       return 0;
+}
+
+static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE) {
+               u64 filter = uncore_shared_reg_config(box, 0);
+               wrmsrl(reg1->reg, filter & 0xffffffff);
+               wrmsrl(reg1->reg + 6, filter >> 32);
+       }
+
+       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops ivbep_uncore_cbox_ops = {
+       .init_box               = ivbep_uncore_msr_init_box,
+       .disable_box            = snbep_uncore_msr_disable_box,
+       .enable_box             = snbep_uncore_msr_enable_box,
+       .disable_event          = snbep_uncore_msr_disable_event,
+       .enable_event           = ivbep_cbox_enable_event,
+       .read_counter           = uncore_msr_read_counter,
+       .hw_config              = ivbep_cbox_hw_config,
+       .get_constraint         = ivbep_cbox_get_constraint,
+       .put_constraint         = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_cbox = {
+       .name                   = "cbox",
+       .num_counters           = 4,
+       .num_boxes              = 15,
+       .perf_ctr_bits          = 44,
+       .event_ctl              = SNBEP_C0_MSR_PMON_CTL0,
+       .perf_ctr               = SNBEP_C0_MSR_PMON_CTR0,
+       .event_mask             = IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_C0_MSR_PMON_BOX_CTL,
+       .msr_offset             = SNBEP_CBO_MSR_OFFSET,
+       .num_shared_regs        = 1,
+       .constraints            = snbep_uncore_cbox_constraints,
+       .ops                    = &ivbep_uncore_cbox_ops,
+       .format_group           = &ivbep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_ops ivbep_uncore_pcu_ops = {
+       IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .hw_config              = snbep_pcu_hw_config,
+       .get_constraint         = snbep_pcu_get_constraint,
+       .put_constraint         = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_pcu = {
+       .name                   = "pcu",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCU_MSR_PMON_CTR0,
+       .event_ctl              = SNBEP_PCU_MSR_PMON_CTL0,
+       .event_mask             = IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCU_MSR_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &ivbep_uncore_pcu_ops,
+       .format_group           = &ivbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *ivbep_msr_uncores[] = {
+       &ivbep_uncore_ubox,
+       &ivbep_uncore_cbox,
+       &ivbep_uncore_pcu,
+       NULL,
+};
+
+void ivbep_uncore_cpu_init(void)
+{
+       if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+       uncore_msr_uncores = ivbep_msr_uncores;
+}
+
+static struct intel_uncore_type ivbep_uncore_ha = {
+       .name           = "ha",
+       .num_counters   = 4,
+       .num_boxes      = 2,
+       .perf_ctr_bits  = 48,
+       IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivbep_uncore_imc = {
+       .name           = "imc",
+       .num_counters   = 4,
+       .num_boxes      = 8,
+       .perf_ctr_bits  = 48,
+       .fixed_ctr_bits = 48,
+       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+       .event_descs    = snbep_uncore_imc_events,
+       IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+/* registers in IRP boxes are not properly aligned */
+static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
+static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
+
+static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+
+       pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx],
+                              hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+
+       pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], hwc->config);
+}
+
+static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+       u64 count = 0;
+
+       pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+       pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+       return count;
+}
+
+static struct intel_uncore_ops ivbep_uncore_irp_ops = {
+       .init_box       = ivbep_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = snbep_uncore_pci_enable_box,
+       .disable_event  = ivbep_uncore_irp_disable_event,
+       .enable_event   = ivbep_uncore_irp_enable_event,
+       .read_counter   = ivbep_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type ivbep_uncore_irp = {
+       .name                   = "irp",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .event_mask             = IVBEP_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .ops                    = &ivbep_uncore_irp_ops,
+       .format_group           = &ivbep_uncore_format_group,
+};
+
+static struct intel_uncore_ops ivbep_uncore_qpi_ops = {
+       .init_box       = ivbep_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = snbep_uncore_pci_enable_box,
+       .disable_event  = snbep_uncore_pci_disable_event,
+       .enable_event   = snbep_qpi_enable_event,
+       .read_counter   = snbep_uncore_pci_read_counter,
+       .hw_config      = snbep_qpi_hw_config,
+       .get_constraint = uncore_get_constraint,
+       .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_qpi = {
+       .name                   = "qpi",
+       .num_counters           = 4,
+       .num_boxes              = 3,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
+       .event_ctl              = SNBEP_PCI_PMON_CTL0,
+       .event_mask             = IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &ivbep_uncore_qpi_ops,
+       .format_group           = &ivbep_uncore_qpi_format_group,
+};
+
+static struct intel_uncore_type ivbep_uncore_r2pcie = {
+       .name           = "r2pcie",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 44,
+       .constraints    = snbep_uncore_r2pcie_constraints,
+       IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivbep_uncore_r3qpi = {
+       .name           = "r3qpi",
+       .num_counters   = 3,
+       .num_boxes      = 2,
+       .perf_ctr_bits  = 44,
+       .constraints    = snbep_uncore_r3qpi_constraints,
+       IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+       IVBEP_PCI_UNCORE_HA,
+       IVBEP_PCI_UNCORE_IMC,
+       IVBEP_PCI_UNCORE_IRP,
+       IVBEP_PCI_UNCORE_QPI,
+       IVBEP_PCI_UNCORE_R2PCIE,
+       IVBEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *ivbep_pci_uncores[] = {
+       [IVBEP_PCI_UNCORE_HA]   = &ivbep_uncore_ha,
+       [IVBEP_PCI_UNCORE_IMC]  = &ivbep_uncore_imc,
+       [IVBEP_PCI_UNCORE_IRP]  = &ivbep_uncore_irp,
+       [IVBEP_PCI_UNCORE_QPI]  = &ivbep_uncore_qpi,
+       [IVBEP_PCI_UNCORE_R2PCIE]       = &ivbep_uncore_r2pcie,
+       [IVBEP_PCI_UNCORE_R3QPI]        = &ivbep_uncore_r3qpi,
+       NULL,
+};
+
+static const struct pci_device_id ivbep_uncore_pci_ids[] = {
+       { /* Home Agent 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0),
+       },
+       { /* Home Agent 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1),
+       },
+       { /* MC0 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0),
+       },
+       { /* MC0 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1),
+       },
+       { /* MC0 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2),
+       },
+       { /* MC0 Channel 4 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3),
+       },
+       { /* MC1 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4),
+       },
+       { /* MC1 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5),
+       },
+       { /* MC1 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6),
+       },
+       { /* MC1 Channel 4 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7),
+       },
+       { /* IRP */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0),
+       },
+       { /* QPI0 Port 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0),
+       },
+       { /* QPI0 Port 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1),
+       },
+       { /* QPI1 Port 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2),
+       },
+       { /* R2PCIe */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0),
+       },
+       { /* R3QPI0 Link 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0),
+       },
+       { /* R3QPI0 Link 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1),
+       },
+       { /* R3QPI1 Link 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2),
+       },
+       { /* QPI Port 0 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT0_FILTER),
+       },
+       { /* QPI Port 0 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT1_FILTER),
+       },
+       { /* end: all zeroes */ }
+};
+
+static struct pci_driver ivbep_uncore_pci_driver = {
+       .name           = "ivbep_uncore",
+       .id_table       = ivbep_uncore_pci_ids,
+};
+
+int ivbep_uncore_pci_init(void)
+{
+       int ret = snbep_pci2phy_map_init(0x0e1e);
+       if (ret)
+               return ret;
+       uncore_pci_uncores = ivbep_pci_uncores;
+       uncore_pci_driver = &ivbep_uncore_pci_driver;
+       return 0;
+}
+/* end of IvyTown uncore support */
+
+/* KNL uncore support */
+static struct attribute *knl_uncore_ubox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh5.attr,
+       NULL,
+};
+
+static struct attribute_group knl_uncore_ubox_format_group = {
+       .name = "format",
+       .attrs = knl_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_ubox = {
+       .name                   = "ubox",
+       .num_counters           = 2,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
+       .event_mask             = KNL_U_MSR_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+       .ops                    = &snbep_uncore_msr_ops,
+       .format_group           = &knl_uncore_ubox_format_group,
+};
+
+static struct attribute *knl_uncore_cha_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_qor.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_filter_tid4.attr,
+       &format_attr_filter_link3.attr,
+       &format_attr_filter_state4.attr,
+       &format_attr_filter_local.attr,
+       &format_attr_filter_all_op.attr,
+       &format_attr_filter_nnm.attr,
+       &format_attr_filter_opc3.attr,
+       &format_attr_filter_nc.attr,
+       &format_attr_filter_isoc.attr,
+       NULL,
+};
+
+static struct attribute_group knl_uncore_cha_format_group = {
+       .name = "format",
+       .attrs = knl_uncore_cha_formats_attr,
+};
+
+static struct event_constraint knl_uncore_cha_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x1f, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg knl_uncore_cha_extra_regs[] = {
+       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4),
+       EVENT_EXTRA_END
+};
+
+static u64 knl_cha_filter_mask(int fields)
+{
+       u64 mask = 0;
+
+       if (fields & 0x1)
+               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID;
+       if (fields & 0x2)
+               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE;
+       if (fields & 0x4)
+               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP;
+       return mask;
+}
+
+static struct event_constraint *
+knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask);
+}
+
+static int knl_cha_hw_config(struct intel_uncore_box *box,
+                            struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct extra_reg *er;
+       int idx = 0;
+
+       for (er = knl_uncore_cha_extra_regs; er->msr; er++) {
+               if (er->event != (event->hw.config & er->config_mask))
+                       continue;
+               idx |= er->idx;
+       }
+
+       if (idx) {
+               reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+                           KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx;
+               reg1->config = event->attr.config1 & knl_cha_filter_mask(idx);
+               reg1->idx = idx;
+       }
+       return 0;
+}
+
+static void hswep_cbox_enable_event(struct intel_uncore_box *box,
+                                   struct perf_event *event);
+
+static struct intel_uncore_ops knl_uncore_cha_ops = {
+       .init_box               = snbep_uncore_msr_init_box,
+       .disable_box            = snbep_uncore_msr_disable_box,
+       .enable_box             = snbep_uncore_msr_enable_box,
+       .disable_event          = snbep_uncore_msr_disable_event,
+       .enable_event           = hswep_cbox_enable_event,
+       .read_counter           = uncore_msr_read_counter,
+       .hw_config              = knl_cha_hw_config,
+       .get_constraint         = knl_cha_get_constraint,
+       .put_constraint         = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type knl_uncore_cha = {
+       .name                   = "cha",
+       .num_counters           = 4,
+       .num_boxes              = 38,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
+       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
+       .event_mask             = KNL_CHA_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
+       .msr_offset             = KNL_CHA_MSR_OFFSET,
+       .num_shared_regs        = 1,
+       .constraints            = knl_uncore_cha_constraints,
+       .ops                    = &knl_uncore_cha_ops,
+       .format_group           = &knl_uncore_cha_format_group,
+};
+
+static struct attribute *knl_uncore_pcu_formats_attr[] = {
+       &format_attr_event2.attr,
+       &format_attr_use_occ_ctr.attr,
+       &format_attr_occ_sel.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh6.attr,
+       &format_attr_occ_invert.attr,
+       &format_attr_occ_edge_det.attr,
+       NULL,
+};
+
+static struct attribute_group knl_uncore_pcu_format_group = {
+       .name = "format",
+       .attrs = knl_uncore_pcu_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_pcu = {
+       .name                   = "pcu",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = HSWEP_PCU_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_PCU_MSR_PMON_CTL0,
+       .event_mask             = KNL_PCU_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_PCU_MSR_PMON_BOX_CTL,
+       .ops                    = &snbep_uncore_msr_ops,
+       .format_group           = &knl_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *knl_msr_uncores[] = {
+       &knl_uncore_ubox,
+       &knl_uncore_cha,
+       &knl_uncore_pcu,
+       NULL,
+};
+
+void knl_uncore_cpu_init(void)
+{
+       uncore_msr_uncores = knl_msr_uncores;
+}
+
+static void knl_uncore_imc_enable_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       int box_ctl = uncore_pci_box_ctl(box);
+
+       pci_write_config_dword(pdev, box_ctl, 0);
+}
+
+static void knl_uncore_imc_enable_event(struct intel_uncore_box *box,
+                                       struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+
+       if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK)
+                                                       == UNCORE_FIXED_EVENT)
+               pci_write_config_dword(pdev, hwc->config_base,
+                                      hwc->config | KNL_PMON_FIXED_CTL_EN);
+       else
+               pci_write_config_dword(pdev, hwc->config_base,
+                                      hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops knl_uncore_imc_ops = {
+       .init_box       = snbep_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = knl_uncore_imc_enable_box,
+       .read_counter   = snbep_uncore_pci_read_counter,
+       .enable_event   = knl_uncore_imc_enable_event,
+       .disable_event  = snbep_uncore_pci_disable_event,
+};
+
+static struct intel_uncore_type knl_uncore_imc_uclk = {
+       .name                   = "imc_uclk",
+       .num_counters           = 4,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = KNL_UCLK_MSR_PMON_CTR0_LOW,
+       .event_ctl              = KNL_UCLK_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+       .fixed_ctl              = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+       .box_ctl                = KNL_UCLK_MSR_PMON_BOX_CTL,
+       .ops                    = &knl_uncore_imc_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_imc_dclk = {
+       .name                   = "imc",
+       .num_counters           = 4,
+       .num_boxes              = 6,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = KNL_MC0_CH0_MSR_PMON_CTR0_LOW,
+       .event_ctl              = KNL_MC0_CH0_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = KNL_MC0_CH0_MSR_PMON_FIXED_LOW,
+       .fixed_ctl              = KNL_MC0_CH0_MSR_PMON_FIXED_CTL,
+       .box_ctl                = KNL_MC0_CH0_MSR_PMON_BOX_CTL,
+       .ops                    = &knl_uncore_imc_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_uclk = {
+       .name                   = "edc_uclk",
+       .num_counters           = 4,
+       .num_boxes              = 8,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = KNL_UCLK_MSR_PMON_CTR0_LOW,
+       .event_ctl              = KNL_UCLK_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+       .fixed_ctl              = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+       .box_ctl                = KNL_UCLK_MSR_PMON_BOX_CTL,
+       .ops                    = &knl_uncore_imc_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_eclk = {
+       .name                   = "edc_eclk",
+       .num_counters           = 4,
+       .num_boxes              = 8,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW,
+       .event_ctl              = KNL_EDC0_ECLK_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW,
+       .fixed_ctl              = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL,
+       .box_ctl                = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL,
+       .ops                    = &knl_uncore_imc_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct event_constraint knl_uncore_m2pcie_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type knl_uncore_m2pcie = {
+       .name           = "m2pcie",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       .constraints    = knl_uncore_m2pcie_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct attribute *knl_uncore_irp_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_qor.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute_group knl_uncore_irp_format_group = {
+       .name = "format",
+       .attrs = knl_uncore_irp_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_irp = {
+       .name                   = "irp",
+       .num_counters           = 2,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
+       .event_ctl              = SNBEP_PCI_PMON_CTL0,
+       .event_mask             = KNL_IRP_PCI_PMON_RAW_EVENT_MASK,
+       .box_ctl                = KNL_IRP_PCI_PMON_BOX_CTL,
+       .ops                    = &snbep_uncore_pci_ops,
+       .format_group           = &knl_uncore_irp_format_group,
+};
+
+enum {
+       KNL_PCI_UNCORE_MC_UCLK,
+       KNL_PCI_UNCORE_MC_DCLK,
+       KNL_PCI_UNCORE_EDC_UCLK,
+       KNL_PCI_UNCORE_EDC_ECLK,
+       KNL_PCI_UNCORE_M2PCIE,
+       KNL_PCI_UNCORE_IRP,
+};
+
+static struct intel_uncore_type *knl_pci_uncores[] = {
+       [KNL_PCI_UNCORE_MC_UCLK]        = &knl_uncore_imc_uclk,
+       [KNL_PCI_UNCORE_MC_DCLK]        = &knl_uncore_imc_dclk,
+       [KNL_PCI_UNCORE_EDC_UCLK]       = &knl_uncore_edc_uclk,
+       [KNL_PCI_UNCORE_EDC_ECLK]       = &knl_uncore_edc_eclk,
+       [KNL_PCI_UNCORE_M2PCIE]         = &knl_uncore_m2pcie,
+       [KNL_PCI_UNCORE_IRP]            = &knl_uncore_irp,
+       NULL,
+};
+
+/*
+ * KNL uses a common PCI device ID for multiple instances of an Uncore PMU
+ * device type. prior to KNL, each instance of a PMU device type had a unique
+ * device ID.
+ *
+ *     PCI Device ID   Uncore PMU Devices
+ *     ----------------------------------
+ *     0x7841          MC0 UClk, MC1 UClk
+ *     0x7843          MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2,
+ *                     MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2
+ *     0x7833          EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk,
+ *                     EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk
+ *     0x7835          EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk,
+ *                     EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk
+ *     0x7817          M2PCIe
+ *     0x7814          IRP
+*/
+
+static const struct pci_device_id knl_uncore_pci_ids[] = {
+       { /* MC UClk */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0),
+       },
+       { /* MC DClk Channel */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0),
+       },
+       { /* EDC UClk */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0),
+       },
+       { /* EDC EClk */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0),
+       },
+       { /* M2PCIe */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0),
+       },
+       { /* IRP */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0),
+       },
+       { /* end: all zeroes */ }
+};
+
+static struct pci_driver knl_uncore_pci_driver = {
+       .name           = "knl_uncore",
+       .id_table       = knl_uncore_pci_ids,
+};
+
+int knl_uncore_pci_init(void)
+{
+       int ret;
+
+       /* All KNL PCI based PMON units are on the same PCI bus except IRP */
+       ret = snb_pci2phy_map_init(0x7814); /* IRP */
+       if (ret)
+               return ret;
+       ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */
+       if (ret)
+               return ret;
+       uncore_pci_uncores = knl_pci_uncores;
+       uncore_pci_driver = &knl_uncore_pci_driver;
+       return 0;
+}
+
+/* end of KNL uncore support */
+
+/* Haswell-EP uncore support */
+static struct attribute *hswep_uncore_ubox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh5.attr,
+       &format_attr_filter_tid2.attr,
+       &format_attr_filter_cid.attr,
+       NULL,
+};
+
+static struct attribute_group hswep_uncore_ubox_format_group = {
+       .name = "format",
+       .attrs = hswep_uncore_ubox_formats_attr,
+};
+
+static int hswep_ubox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       reg1->reg = HSWEP_U_MSR_PMON_FILTER;
+       reg1->config = event->attr.config1 & HSWEP_U_MSR_PMON_BOX_FILTER_MASK;
+       reg1->idx = 0;
+       return 0;
+}
+
+static struct intel_uncore_ops hswep_uncore_ubox_ops = {
+       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .hw_config              = hswep_ubox_hw_config,
+       .get_constraint         = uncore_get_constraint,
+       .put_constraint         = uncore_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_ubox = {
+       .name                   = "ubox",
+       .num_counters           = 2,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 44,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &hswep_uncore_ubox_ops,
+       .format_group           = &hswep_uncore_ubox_format_group,
+};
+
+static struct attribute *hswep_uncore_cbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_filter_tid3.attr,
+       &format_attr_filter_link2.attr,
+       &format_attr_filter_state3.attr,
+       &format_attr_filter_nid2.attr,
+       &format_attr_filter_opc2.attr,
+       &format_attr_filter_nc.attr,
+       &format_attr_filter_c6.attr,
+       &format_attr_filter_isoc.attr,
+       NULL,
+};
+
+static struct attribute_group hswep_uncore_cbox_format_group = {
+       .name = "format",
+       .attrs = hswep_uncore_cbox_formats_attr,
+};
+
+static struct event_constraint hswep_uncore_cbox_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x09, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg hswep_uncore_cbox_extra_regs[] = {
+       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4028, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4032, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4029, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4033, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x402A, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x12),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
+       EVENT_EXTRA_END
+};
+
+static u64 hswep_cbox_filter_mask(int fields)
+{
+       u64 mask = 0;
+       if (fields & 0x1)
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_TID;
+       if (fields & 0x2)
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK;
+       if (fields & 0x4)
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+       if (fields & 0x8)
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NID;
+       if (fields & 0x10) {
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NC;
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_C6;
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
+       }
+       return mask;
+}
+
+static struct event_constraint *
+hswep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       return __snbep_cbox_get_constraint(box, event, hswep_cbox_filter_mask);
+}
+
+static int hswep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct extra_reg *er;
+       int idx = 0;
+
+       for (er = hswep_uncore_cbox_extra_regs; er->msr; er++) {
+               if (er->event != (event->hw.config & er->config_mask))
+                       continue;
+               idx |= er->idx;
+       }
+
+       if (idx) {
+               reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+                           HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+               reg1->config = event->attr.config1 & hswep_cbox_filter_mask(idx);
+               reg1->idx = idx;
+       }
+       return 0;
+}
+
+static void hswep_cbox_enable_event(struct intel_uncore_box *box,
+                                 struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE) {
+               u64 filter = uncore_shared_reg_config(box, 0);
+               wrmsrl(reg1->reg, filter & 0xffffffff);
+               wrmsrl(reg1->reg + 1, filter >> 32);
+       }
+
+       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops hswep_uncore_cbox_ops = {
+       .init_box               = snbep_uncore_msr_init_box,
+       .disable_box            = snbep_uncore_msr_disable_box,
+       .enable_box             = snbep_uncore_msr_enable_box,
+       .disable_event          = snbep_uncore_msr_disable_event,
+       .enable_event           = hswep_cbox_enable_event,
+       .read_counter           = uncore_msr_read_counter,
+       .hw_config              = hswep_cbox_hw_config,
+       .get_constraint         = hswep_cbox_get_constraint,
+       .put_constraint         = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_cbox = {
+       .name                   = "cbox",
+       .num_counters           = 4,
+       .num_boxes              = 18,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
+       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
+       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
+       .msr_offset             = HSWEP_CBO_MSR_OFFSET,
+       .num_shared_regs        = 1,
+       .constraints            = hswep_uncore_cbox_constraints,
+       .ops                    = &hswep_uncore_cbox_ops,
+       .format_group           = &hswep_uncore_cbox_format_group,
+};
+
+/*
+ * Write SBOX Initialization register bit by bit to avoid spurious #GPs
+ */
+static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box)
+{
+       unsigned msr = uncore_msr_box_ctl(box);
+
+       if (msr) {
+               u64 init = SNBEP_PMON_BOX_CTL_INT;
+               u64 flags = 0;
+               int i;
+
+               for_each_set_bit(i, (unsigned long *)&init, 64) {
+                       flags |= (1ULL << i);
+                       wrmsrl(msr, flags);
+               }
+       }
+}
+
+static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = {
+       __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .init_box               = hswep_uncore_sbox_msr_init_box
+};
+
+static struct attribute *hswep_uncore_sbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute_group hswep_uncore_sbox_format_group = {
+       .name = "format",
+       .attrs = hswep_uncore_sbox_formats_attr,
+};
+
+static struct intel_uncore_type hswep_uncore_sbox = {
+       .name                   = "sbox",
+       .num_counters           = 4,
+       .num_boxes              = 4,
+       .perf_ctr_bits          = 44,
+       .event_ctl              = HSWEP_S0_MSR_PMON_CTL0,
+       .perf_ctr               = HSWEP_S0_MSR_PMON_CTR0,
+       .event_mask             = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_S0_MSR_PMON_BOX_CTL,
+       .msr_offset             = HSWEP_SBOX_MSR_OFFSET,
+       .ops                    = &hswep_uncore_sbox_msr_ops,
+       .format_group           = &hswep_uncore_sbox_format_group,
+};
+
+static int hswep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
+
+       if (ev_sel >= 0xb && ev_sel <= 0xe) {
+               reg1->reg = HSWEP_PCU_MSR_PMON_BOX_FILTER;
+               reg1->idx = ev_sel - 0xb;
+               reg1->config = event->attr.config1 & (0xff << reg1->idx);
+       }
+       return 0;
+}
+
+static struct intel_uncore_ops hswep_uncore_pcu_ops = {
+       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .hw_config              = hswep_pcu_hw_config,
+       .get_constraint         = snbep_pcu_get_constraint,
+       .put_constraint         = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_pcu = {
+       .name                   = "pcu",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = HSWEP_PCU_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_PCU_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_PCU_MSR_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &hswep_uncore_pcu_ops,
+       .format_group           = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *hswep_msr_uncores[] = {
+       &hswep_uncore_ubox,
+       &hswep_uncore_cbox,
+       &hswep_uncore_sbox,
+       &hswep_uncore_pcu,
+       NULL,
+};
+
+void hswep_uncore_cpu_init(void)
+{
+       int pkg = topology_phys_to_logical_pkg(0);
+
+       if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+
+       /* Detect 6-8 core systems with only two SBOXes */
+       if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) {
+               u32 capid4;
+
+               pci_read_config_dword(uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3],
+                                     0x94, &capid4);
+               if (((capid4 >> 6) & 0x3) == 0)
+                       hswep_uncore_sbox.num_boxes = 2;
+       }
+
+       uncore_msr_uncores = hswep_msr_uncores;
+}
+
+static struct intel_uncore_type hswep_uncore_ha = {
+       .name           = "ha",
+       .num_counters   = 5,
+       .num_boxes      = 2,
+       .perf_ctr_bits  = 48,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct uncore_event_desc hswep_uncore_imc_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks,      "event=0x00,umask=0x00"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read,  "event=0x04,umask=0x03"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
+       { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type hswep_uncore_imc = {
+       .name           = "imc",
+       .num_counters   = 5,
+       .num_boxes      = 8,
+       .perf_ctr_bits  = 48,
+       .fixed_ctr_bits = 48,
+       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+       .event_descs    = hswep_uncore_imc_events,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8};
+
+static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+       u64 count = 0;
+
+       pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+       pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+       return count;
+}
+
+static struct intel_uncore_ops hswep_uncore_irp_ops = {
+       .init_box       = snbep_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = snbep_uncore_pci_enable_box,
+       .disable_event  = ivbep_uncore_irp_disable_event,
+       .enable_event   = ivbep_uncore_irp_enable_event,
+       .read_counter   = hswep_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type hswep_uncore_irp = {
+       .name                   = "irp",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .ops                    = &hswep_uncore_irp_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type hswep_uncore_qpi = {
+       .name                   = "qpi",
+       .num_counters           = 5,
+       .num_boxes              = 3,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
+       .event_ctl              = SNBEP_PCI_PMON_CTL0,
+       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &snbep_uncore_qpi_ops,
+       .format_group           = &snbep_uncore_qpi_format_group,
+};
+
+static struct event_constraint hswep_uncore_r2pcie_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x24, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x27, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2a, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type hswep_uncore_r2pcie = {
+       .name           = "r2pcie",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       .constraints    = hswep_uncore_r2pcie_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct event_constraint hswep_uncore_r3qpi_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x01, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type hswep_uncore_r3qpi = {
+       .name           = "r3qpi",
+       .num_counters   = 4,
+       .num_boxes      = 3,
+       .perf_ctr_bits  = 44,
+       .constraints    = hswep_uncore_r3qpi_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+       HSWEP_PCI_UNCORE_HA,
+       HSWEP_PCI_UNCORE_IMC,
+       HSWEP_PCI_UNCORE_IRP,
+       HSWEP_PCI_UNCORE_QPI,
+       HSWEP_PCI_UNCORE_R2PCIE,
+       HSWEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *hswep_pci_uncores[] = {
+       [HSWEP_PCI_UNCORE_HA]   = &hswep_uncore_ha,
+       [HSWEP_PCI_UNCORE_IMC]  = &hswep_uncore_imc,
+       [HSWEP_PCI_UNCORE_IRP]  = &hswep_uncore_irp,
+       [HSWEP_PCI_UNCORE_QPI]  = &hswep_uncore_qpi,
+       [HSWEP_PCI_UNCORE_R2PCIE]       = &hswep_uncore_r2pcie,
+       [HSWEP_PCI_UNCORE_R3QPI]        = &hswep_uncore_r3qpi,
+       NULL,
+};
+
+static const struct pci_device_id hswep_uncore_pci_ids[] = {
+       { /* Home Agent 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0),
+       },
+       { /* Home Agent 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f38),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 1),
+       },
+       { /* MC0 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb0),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 0),
+       },
+       { /* MC0 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb1),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 1),
+       },
+       { /* MC0 Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb4),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 2),
+       },
+       { /* MC0 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb5),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 3),
+       },
+       { /* MC1 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd0),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 4),
+       },
+       { /* MC1 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd1),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 5),
+       },
+       { /* MC1 Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd4),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 6),
+       },
+       { /* MC1 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd5),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 7),
+       },
+       { /* IRP */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f39),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IRP, 0),
+       },
+       { /* QPI0 Port 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f32),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 0),
+       },
+       { /* QPI0 Port 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f33),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 1),
+       },
+       { /* QPI1 Port 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3a),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 2),
+       },
+       { /* R2PCIe */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f34),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R2PCIE, 0),
+       },
+       { /* R3QPI0 Link 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f36),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 0),
+       },
+       { /* R3QPI0 Link 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f37),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 1),
+       },
+       { /* R3QPI1 Link 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3e),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 2),
+       },
+       { /* QPI Port 0 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f86),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT0_FILTER),
+       },
+       { /* QPI Port 1 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f96),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT1_FILTER),
+       },
+       { /* PCU.3 (for Capability registers) */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  HSWEP_PCI_PCU_3),
+       },
+       { /* end: all zeroes */ }
+};
+
+static struct pci_driver hswep_uncore_pci_driver = {
+       .name           = "hswep_uncore",
+       .id_table       = hswep_uncore_pci_ids,
+};
+
+int hswep_uncore_pci_init(void)
+{
+       int ret = snbep_pci2phy_map_init(0x2f1e);
+       if (ret)
+               return ret;
+       uncore_pci_uncores = hswep_pci_uncores;
+       uncore_pci_driver = &hswep_uncore_pci_driver;
+       return 0;
+}
+/* end of Haswell-EP uncore support */
+
+/* BDX uncore support */
+
+static struct intel_uncore_type bdx_uncore_ubox = {
+       .name                   = "ubox",
+       .num_counters           = 2,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &ivbep_uncore_msr_ops,
+       .format_group           = &ivbep_uncore_ubox_format_group,
+};
+
+static struct event_constraint bdx_uncore_cbox_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_cbox = {
+       .name                   = "cbox",
+       .num_counters           = 4,
+       .num_boxes              = 24,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
+       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
+       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
+       .msr_offset             = HSWEP_CBO_MSR_OFFSET,
+       .num_shared_regs        = 1,
+       .constraints            = bdx_uncore_cbox_constraints,
+       .ops                    = &hswep_uncore_cbox_ops,
+       .format_group           = &hswep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_type bdx_uncore_sbox = {
+       .name                   = "sbox",
+       .num_counters           = 4,
+       .num_boxes              = 4,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = HSWEP_S0_MSR_PMON_CTL0,
+       .perf_ctr               = HSWEP_S0_MSR_PMON_CTR0,
+       .event_mask             = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_S0_MSR_PMON_BOX_CTL,
+       .msr_offset             = HSWEP_SBOX_MSR_OFFSET,
+       .ops                    = &hswep_uncore_sbox_msr_ops,
+       .format_group           = &hswep_uncore_sbox_format_group,
+};
+
+#define BDX_MSR_UNCORE_SBOX    3
+
+static struct intel_uncore_type *bdx_msr_uncores[] = {
+       &bdx_uncore_ubox,
+       &bdx_uncore_cbox,
+       &hswep_uncore_pcu,
+       &bdx_uncore_sbox,
+       NULL,
+};
+
+void bdx_uncore_cpu_init(void)
+{
+       if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+       uncore_msr_uncores = bdx_msr_uncores;
+
+       /* BDX-DE doesn't have SBOX */
+       if (boot_cpu_data.x86_model == 86)
+               uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
+}
+
+static struct intel_uncore_type bdx_uncore_ha = {
+       .name           = "ha",
+       .num_counters   = 4,
+       .num_boxes      = 2,
+       .perf_ctr_bits  = 48,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_imc = {
+       .name           = "imc",
+       .num_counters   = 5,
+       .num_boxes      = 8,
+       .perf_ctr_bits  = 48,
+       .fixed_ctr_bits = 48,
+       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+       .event_descs    = hswep_uncore_imc_events,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_irp = {
+       .name                   = "irp",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .ops                    = &hswep_uncore_irp_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type bdx_uncore_qpi = {
+       .name                   = "qpi",
+       .num_counters           = 4,
+       .num_boxes              = 3,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
+       .event_ctl              = SNBEP_PCI_PMON_CTL0,
+       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &snbep_uncore_qpi_ops,
+       .format_group           = &snbep_uncore_qpi_format_group,
+};
+
+static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r2pcie = {
+       .name           = "r2pcie",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       .constraints    = bdx_uncore_r2pcie_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct event_constraint bdx_uncore_r3qpi_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x01, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r3qpi = {
+       .name           = "r3qpi",
+       .num_counters   = 3,
+       .num_boxes      = 3,
+       .perf_ctr_bits  = 48,
+       .constraints    = bdx_uncore_r3qpi_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+       BDX_PCI_UNCORE_HA,
+       BDX_PCI_UNCORE_IMC,
+       BDX_PCI_UNCORE_IRP,
+       BDX_PCI_UNCORE_QPI,
+       BDX_PCI_UNCORE_R2PCIE,
+       BDX_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *bdx_pci_uncores[] = {
+       [BDX_PCI_UNCORE_HA]     = &bdx_uncore_ha,
+       [BDX_PCI_UNCORE_IMC]    = &bdx_uncore_imc,
+       [BDX_PCI_UNCORE_IRP]    = &bdx_uncore_irp,
+       [BDX_PCI_UNCORE_QPI]    = &bdx_uncore_qpi,
+       [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie,
+       [BDX_PCI_UNCORE_R3QPI]  = &bdx_uncore_r3qpi,
+       NULL,
+};
+
+static const struct pci_device_id bdx_uncore_pci_ids[] = {
+       { /* Home Agent 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0),
+       },
+       { /* Home Agent 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1),
+       },
+       { /* MC0 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0),
+       },
+       { /* MC0 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1),
+       },
+       { /* MC0 Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2),
+       },
+       { /* MC0 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3),
+       },
+       { /* MC1 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4),
+       },
+       { /* MC1 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5),
+       },
+       { /* MC1 Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6),
+       },
+       { /* MC1 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7),
+       },
+       { /* IRP */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
+       },
+       { /* QPI0 Port 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0),
+       },
+       { /* QPI0 Port 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1),
+       },
+       { /* QPI1 Port 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2),
+       },
+       { /* R2PCIe */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
+       },
+       { /* R3QPI0 Link 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0),
+       },
+       { /* R3QPI0 Link 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1),
+       },
+       { /* R3QPI1 Link 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2),
+       },
+       { /* QPI Port 0 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0),
+       },
+       { /* QPI Port 1 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1),
+       },
+       { /* QPI Port 2 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2),
+       },
+       { /* end: all zeroes */ }
+};
+
+static struct pci_driver bdx_uncore_pci_driver = {
+       .name           = "bdx_uncore",
+       .id_table       = bdx_uncore_pci_ids,
+};
+
+int bdx_uncore_pci_init(void)
+{
+       int ret = snbep_pci2phy_map_init(0x6f1e);
+
+       if (ret)
+               return ret;
+       uncore_pci_uncores = bdx_pci_uncores;
+       uncore_pci_driver = &bdx_uncore_pci_driver;
+       return 0;
+}
+
+/* end of BDX uncore support */
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c

new file mode 100644 (file)

index 0000000..ec863b9
--- /dev/null
+++ b/arch/x86/events/msr.c
@@ -0,0 +1,241 @@
+#include <linux/perf_event.h>
+
+enum perf_msr_id {
+       PERF_MSR_TSC                    = 0,
+       PERF_MSR_APERF                  = 1,
+       PERF_MSR_MPERF                  = 2,
+       PERF_MSR_PPERF                  = 3,
+       PERF_MSR_SMI                    = 4,
+
+       PERF_MSR_EVENT_MAX,
+};
+
+static bool test_aperfmperf(int idx)
+{
+       return boot_cpu_has(X86_FEATURE_APERFMPERF);
+}
+
+static bool test_intel(int idx)
+{
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+           boot_cpu_data.x86 != 6)
+               return false;
+
+       switch (boot_cpu_data.x86_model) {
+       case 30: /* 45nm Nehalem    */
+       case 26: /* 45nm Nehalem-EP */
+       case 46: /* 45nm Nehalem-EX */
+
+       case 37: /* 32nm Westmere    */
+       case 44: /* 32nm Westmere-EP */
+       case 47: /* 32nm Westmere-EX */
+
+       case 42: /* 32nm SandyBridge         */
+       case 45: /* 32nm SandyBridge-E/EN/EP */
+
+       case 58: /* 22nm IvyBridge       */
+       case 62: /* 22nm IvyBridge-EP/EX */
+
+       case 60: /* 22nm Haswell Core */
+       case 63: /* 22nm Haswell Server */
+       case 69: /* 22nm Haswell ULT */
+       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+       case 61: /* 14nm Broadwell Core-M */
+       case 86: /* 14nm Broadwell Xeon D */
+       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+       case 79: /* 14nm Broadwell Server */
+
+       case 55: /* 22nm Atom "Silvermont"                */
+       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+       case 76: /* 14nm Atom "Airmont"                   */
+               if (idx == PERF_MSR_SMI)
+                       return true;
+               break;
+
+       case 78: /* 14nm Skylake Mobile */
+       case 94: /* 14nm Skylake Desktop */
+               if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
+                       return true;
+               break;
+       }
+
+       return false;
+}
+
+struct perf_msr {
+       u64     msr;
+       struct  perf_pmu_events_attr *attr;
+       bool    (*test)(int idx);
+};
+
+PMU_EVENT_ATTR_STRING(tsc,   evattr_tsc,   "event=0x00");
+PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
+PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
+PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
+PMU_EVENT_ATTR_STRING(smi,   evattr_smi,   "event=0x04");
+
+static struct perf_msr msr[] = {
+       [PERF_MSR_TSC]   = { 0,                 &evattr_tsc,    NULL,            },
+       [PERF_MSR_APERF] = { MSR_IA32_APERF,    &evattr_aperf,  test_aperfmperf, },
+       [PERF_MSR_MPERF] = { MSR_IA32_MPERF,    &evattr_mperf,  test_aperfmperf, },
+       [PERF_MSR_PPERF] = { MSR_PPERF,         &evattr_pperf,  test_intel,      },
+       [PERF_MSR_SMI]   = { MSR_SMI_COUNT,     &evattr_smi,    test_intel,      },
+};
+
+static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
+       NULL,
+};
+
+static struct attribute_group events_attr_group = {
+       .name = "events",
+       .attrs = events_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+static struct attribute *format_attrs[] = {
+       &format_attr_event.attr,
+       NULL,
+};
+static struct attribute_group format_attr_group = {
+       .name = "format",
+       .attrs = format_attrs,
+};
+
+static const struct attribute_group *attr_groups[] = {
+       &events_attr_group,
+       &format_attr_group,
+       NULL,
+};
+
+static int msr_event_init(struct perf_event *event)
+{
+       u64 cfg = event->attr.config;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       if (cfg >= PERF_MSR_EVENT_MAX)
+               return -EINVAL;
+
+       /* unsupported modes and filters */
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest  ||
+           event->attr.sample_period) /* no sampling */
+               return -EINVAL;
+
+       if (!msr[cfg].attr)
+               return -EINVAL;
+
+       event->hw.idx = -1;
+       event->hw.event_base = msr[cfg].msr;
+       event->hw.config = cfg;
+
+       return 0;
+}
+
+static inline u64 msr_read_counter(struct perf_event *event)
+{
+       u64 now;
+
+       if (event->hw.event_base)
+               rdmsrl(event->hw.event_base, now);
+       else
+               rdtscll(now);
+
+       return now;
+}
+static void msr_event_update(struct perf_event *event)
+{
+       u64 prev, now;
+       s64 delta;
+
+       /* Careful, an NMI might modify the previous event value. */
+again:
+       prev = local64_read(&event->hw.prev_count);
+       now = msr_read_counter(event);
+
+       if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev)
+               goto again;
+
+       delta = now - prev;
+       if (unlikely(event->hw.event_base == MSR_SMI_COUNT))
+               delta = sign_extend64(delta, 31);
+
+       local64_add(now - prev, &event->count);
+}
+
+static void msr_event_start(struct perf_event *event, int flags)
+{
+       u64 now;
+
+       now = msr_read_counter(event);
+       local64_set(&event->hw.prev_count, now);
+}
+
+static void msr_event_stop(struct perf_event *event, int flags)
+{
+       msr_event_update(event);
+}
+
+static void msr_event_del(struct perf_event *event, int flags)
+{
+       msr_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int msr_event_add(struct perf_event *event, int flags)
+{
+       if (flags & PERF_EF_START)
+               msr_event_start(event, flags);
+
+       return 0;
+}
+
+static struct pmu pmu_msr = {
+       .task_ctx_nr    = perf_sw_context,
+       .attr_groups    = attr_groups,
+       .event_init     = msr_event_init,
+       .add            = msr_event_add,
+       .del            = msr_event_del,
+       .start          = msr_event_start,
+       .stop           = msr_event_stop,
+       .read           = msr_event_update,
+       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static int __init msr_init(void)
+{
+       int i, j = 0;
+
+       if (!boot_cpu_has(X86_FEATURE_TSC)) {
+               pr_cont("no MSR PMU driver.\n");
+               return 0;
+       }
+
+       /* Probe the MSRs. */
+       for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
+               u64 val;
+
+               /*
+                * Virt sucks arse; you cannot tell if a R/O MSR is present :/
+                */
+               if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
+                       msr[i].attr = NULL;
+       }
+
+       /* List remaining MSRs in the sysfs attrs. */
+       for (i = 0; i < PERF_MSR_EVENT_MAX; i++) {
+               if (msr[i].attr)
+                       events_attrs[j++] = &msr[i].attr->attr.attr;
+       }
+       events_attrs[j] = NULL;
+
+       perf_pmu_register(&pmu_msr, "msr", -1);
+
+       return 0;
+}
+device_initcall(msr_init);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h

new file mode 100644 (file)

index 0000000..68155ca
--- /dev/null
+++ b/arch/x86/events/perf_event.h
@@ -0,0 +1,960 @@
+/*
+ * Performance events x86 architecture header
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
+ *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ *  Copyright (C) 2009 Google, Inc., Stephane Eranian
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+
+/* To enable MSR tracing please use the generic trace points. */
+
+/*
+ *          |   NHM/WSM    |      SNB     |
+ * register -------------------------------
+ *          |  HT  | no HT |  HT  | no HT |
+ *-----------------------------------------
+ * offcore  | core | core  | cpu  | core  |
+ * lbr_sel  | core | core  | cpu  | core  |
+ * ld_lat   | cpu  | core  | cpu  | core  |
+ *-----------------------------------------
+ *
+ * Given that there is a small number of shared regs,
+ * we can pre-allocate their slot in the per-cpu
+ * per-core reg tables.
+ */
+enum extra_reg_type {
+       EXTRA_REG_NONE  = -1,   /* not used */
+
+       EXTRA_REG_RSP_0 = 0,    /* offcore_response_0 */
+       EXTRA_REG_RSP_1 = 1,    /* offcore_response_1 */
+       EXTRA_REG_LBR   = 2,    /* lbr_select */
+       EXTRA_REG_LDLAT = 3,    /* ld_lat_threshold */
+       EXTRA_REG_FE    = 4,    /* fe_* */
+
+       EXTRA_REG_MAX           /* number of entries needed */
+};
+
+struct event_constraint {
+       union {
+               unsigned long   idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+               u64             idxmsk64;
+       };
+       u64     code;
+       u64     cmask;
+       int     weight;
+       int     overlap;
+       int     flags;
+};
+/*
+ * struct hw_perf_event.flags flags
+ */
+#define PERF_X86_EVENT_PEBS_LDLAT      0x0001 /* ld+ldlat data address sampling */
+#define PERF_X86_EVENT_PEBS_ST         0x0002 /* st data address sampling */
+#define PERF_X86_EVENT_PEBS_ST_HSW     0x0004 /* haswell style datala, store */
+#define PERF_X86_EVENT_COMMITTED       0x0008 /* event passed commit_txn */
+#define PERF_X86_EVENT_PEBS_LD_HSW     0x0010 /* haswell style datala, load */
+#define PERF_X86_EVENT_PEBS_NA_HSW     0x0020 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_EXCL            0x0040 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC         0x0080 /* dynamic alloc'd constraint */
+#define PERF_X86_EVENT_RDPMC_ALLOWED   0x0100 /* grant rdpmc permission */
+#define PERF_X86_EVENT_EXCL_ACCT       0x0200 /* accounted EXCL event */
+#define PERF_X86_EVENT_AUTO_RELOAD     0x0400 /* use PEBS auto-reload */
+#define PERF_X86_EVENT_FREERUNNING     0x0800 /* use freerunning PEBS */
+
+
+struct amd_nb {
+       int nb_id;  /* NorthBridge id */
+       int refcnt; /* reference count */
+       struct perf_event *owners[X86_PMC_IDX_MAX];
+       struct event_constraint event_constraints[X86_PMC_IDX_MAX];
+};
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS                8
+
+/*
+ * Flags PEBS can handle without an PMI.
+ *
+ * TID can only be handled by flushing at context switch.
+ *
+ */
+#define PEBS_FREERUNNING_FLAGS \
+       (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
+       PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
+       PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
+       PERF_SAMPLE_TRANSACTION)
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+       u64     bts_buffer_base;
+       u64     bts_index;
+       u64     bts_absolute_maximum;
+       u64     bts_interrupt_threshold;
+       u64     pebs_buffer_base;
+       u64     pebs_index;
+       u64     pebs_absolute_maximum;
+       u64     pebs_interrupt_threshold;
+       u64     pebs_event_reset[MAX_PEBS_EVENTS];
+};
+
+/*
+ * Per register state.
+ */
+struct er_account {
+       raw_spinlock_t          lock;   /* per-core: protect structure */
+       u64                 config;     /* extra MSR config */
+       u64                 reg;        /* extra MSR number */
+       atomic_t            ref;        /* reference count */
+};
+
+/*
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
+ */
+struct intel_shared_regs {
+       struct er_account       regs[EXTRA_REG_MAX];
+       int                     refcnt;         /* per-core: #HT threads */
+       unsigned                core_id;        /* per-core: core id */
+};
+
+enum intel_excl_state_type {
+       INTEL_EXCL_UNUSED    = 0, /* counter is unused */
+       INTEL_EXCL_SHARED    = 1, /* counter can be used by both threads */
+       INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
+};
+
+struct intel_excl_states {
+       enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+       bool sched_started; /* true if scheduling has started */
+};
+
+struct intel_excl_cntrs {
+       raw_spinlock_t  lock;
+
+       struct intel_excl_states states[2];
+
+       union {
+               u16     has_exclusive[2];
+               u32     exclusive_present;
+       };
+
+       int             refcnt;         /* per-core: #HT threads */
+       unsigned        core_id;        /* per-core: core id */
+};
+
+#define MAX_LBR_ENTRIES                32
+
+enum {
+       X86_PERF_KFREE_SHARED = 0,
+       X86_PERF_KFREE_EXCL   = 1,
+       X86_PERF_KFREE_MAX
+};
+
+struct cpu_hw_events {
+       /*
+        * Generic x86 PMC bits
+        */
+       struct perf_event       *events[X86_PMC_IDX_MAX]; /* in counter order */
+       unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       unsigned long           running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       int                     enabled;
+
+       int                     n_events; /* the # of events in the below arrays */
+       int                     n_added;  /* the # last events in the below arrays;
+                                            they've never been enabled yet */
+       int                     n_txn;    /* the # last events in the below arrays;
+                                            added in the current transaction */
+       int                     assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
+       u64                     tags[X86_PMC_IDX_MAX];
+
+       struct perf_event       *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+       struct event_constraint *event_constraint[X86_PMC_IDX_MAX];
+
+       int                     n_excl; /* the number of exclusive events */
+
+       unsigned int            txn_flags;
+       int                     is_fake;
+
+       /*
+        * Intel DebugStore bits
+        */
+       struct debug_store      *ds;
+       u64                     pebs_enabled;
+
+       /*
+        * Intel LBR bits
+        */
+       int                             lbr_users;
+       void                            *lbr_context;
+       struct perf_branch_stack        lbr_stack;
+       struct perf_branch_entry        lbr_entries[MAX_LBR_ENTRIES];
+       struct er_account               *lbr_sel;
+       u64                             br_sel;
+
+       /*
+        * Intel host/guest exclude bits
+        */
+       u64                             intel_ctrl_guest_mask;
+       u64                             intel_ctrl_host_mask;
+       struct perf_guest_switch_msr    guest_switch_msrs[X86_PMC_IDX_MAX];
+
+       /*
+        * Intel checkpoint mask
+        */
+       u64                             intel_cp_status;
+
+       /*
+        * manage shared (per-core, per-cpu) registers
+        * used on Intel NHM/WSM/SNB
+        */
+       struct intel_shared_regs        *shared_regs;
+       /*
+        * manage exclusive counter access between hyperthread
+        */
+       struct event_constraint *constraint_list; /* in enable order */
+       struct intel_excl_cntrs         *excl_cntrs;
+       int excl_thread_id; /* 0 or 1 */
+
+       /*
+        * AMD specific bits
+        */
+       struct amd_nb                   *amd_nb;
+       /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
+       u64                             perf_ctr_virt_mask;
+
+       void                            *kfree_on_online[X86_PERF_KFREE_MAX];
+};
+
+#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
+       { .idxmsk64 = (n) },            \
+       .code = (c),                    \
+       .cmask = (m),                   \
+       .weight = (w),                  \
+       .overlap = (o),                 \
+       .flags = f,                     \
+}
+
+#define EVENT_CONSTRAINT(c, n, m)      \
+       __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
+
+#define INTEL_EXCLEVT_CONSTRAINT(c, n) \
+       __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
+                          0, PERF_X86_EVENT_EXCL)
+
+/*
+ * The overlap flag marks event constraints with overlapping counter
+ * masks. This is the case if the counter mask of such an event is not
+ * a subset of any other counter mask of a constraint with an equal or
+ * higher weight, e.g.:
+ *
+ *  c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
+ *  c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
+ *  c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
+ *
+ * The event scheduler may not select the correct counter in the first
+ * cycle because it needs to know which subsequent events will be
+ * scheduled. It may fail to schedule the events then. So we set the
+ * overlap flag for such constraints to give the scheduler a hint which
+ * events to select for counter rescheduling.
+ *
+ * Care must be taken as the rescheduling algorithm is O(n!) which
+ * will increase scheduling cycles for an over-commited system
+ * dramatically.  The number of such EVENT_CONSTRAINT_OVERLAP() macros
+ * and its counter masks must be kept at a minimum.
+ */
+#define EVENT_CONSTRAINT_OVERLAP(c, n, m)      \
+       __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0)
+
+/*
+ * Constraint on the Event code.
+ */
+#define INTEL_EVENT_CONSTRAINT(c, n)   \
+       EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
+
+/*
+ * Constraint on the Event code + UMask + fixed-mask
+ *
+ * filter mask to validate fixed counter events.
+ * the following filters disqualify for fixed counters:
+ *  - inv
+ *  - edge
+ *  - cnt-mask
+ *  - in_tx
+ *  - in_tx_checkpointed
+ *  The other filters are supported by fixed counters.
+ *  The any-thread option is supported starting with v3.
+ */
+#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)
+#define FIXED_EVENT_CONSTRAINT(c, n)   \
+       EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS)
+
+/*
+ * Constraint on the Event code + UMask
+ */
+#define INTEL_UEVENT_CONSTRAINT(c, n)  \
+       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
+
+/* Constraint on specific umask bit only + event */
+#define INTEL_UBIT_EVENT_CONSTRAINT(c, n)      \
+       EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c))
+
+/* Like UEVENT_CONSTRAINT, but match flags too */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n)    \
+       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+
+#define INTEL_EXCLUEVT_CONSTRAINT(c, n)        \
+       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+                          HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
+
+#define INTEL_PLD_CONSTRAINT(c, n)     \
+       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                          HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
+
+#define INTEL_PST_CONSTRAINT(c, n)     \
+       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
+
+/* Event constraint, but match on all event flags too. */
+#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
+       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+
+/* Check only flags, but allow all event/umask */
+#define INTEL_ALL_EVENT_CONSTRAINT(code, n)    \
+       EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS)
+
+/* Check flags and event code, and set the HSW store flag */
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+
+/* Check flags and event code, and set the HSW load flag */
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, \
+                         PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
+/* Check flags and event code/umask, and set the HSW store flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, \
+                         PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)
+
+/* Check flags and event code/umask, and set the HSW load flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, \
+                         PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
+/* Check flags and event code/umask, and set the HSW N/A flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
+
+
+/*
+ * We define the end marker as having a weight of -1
+ * to enable blacklisting of events using a counter bitmask
+ * of zero and thus a weight of zero.
+ * The end marker has a weight that cannot possibly be
+ * obtained from counting the bits in the bitmask.
+ */
+#define EVENT_CONSTRAINT_END { .weight = -1 }
+
+/*
+ * Check for end marker with weight == -1
+ */
+#define for_each_event_constraint(e, c)        \
+       for ((e) = (c); (e)->weight != -1; (e)++)
+
+/*
+ * Extra registers for specific events.
+ *
+ * Some events need large masks and require external MSRs.
+ * Those extra MSRs end up being shared for all events on
+ * a PMU and sometimes between PMU of sibling HT threads.
+ * In either case, the kernel needs to handle conflicting
+ * accesses to those extra, shared, regs. The data structure
+ * to manage those registers is stored in cpu_hw_event.
+ */
+struct extra_reg {
+       unsigned int            event;
+       unsigned int            msr;
+       u64                     config_mask;
+       u64                     valid_mask;
+       int                     idx;  /* per_xxx->regs[] reg index */
+       bool                    extra_msr_access;
+};
+
+#define EVENT_EXTRA_REG(e, ms, m, vm, i) {     \
+       .event = (e),                   \
+       .msr = (ms),                    \
+       .config_mask = (m),             \
+       .valid_mask = (vm),             \
+       .idx = EXTRA_REG_##i,           \
+       .extra_msr_access = true,       \
+       }
+
+#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)     \
+       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
+
+#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \
+       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
+                       ARCH_PERFMON_EVENTSEL_UMASK, vm, idx)
+
+#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \
+       INTEL_UEVENT_EXTRA_REG(c, \
+                              MSR_PEBS_LD_LAT_THRESHOLD, \
+                              0xffff, \
+                              LDLAT)
+
+#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
+
+union perf_capabilities {
+       struct {
+               u64     lbr_format:6;
+               u64     pebs_trap:1;
+               u64     pebs_arch_reg:1;
+               u64     pebs_format:4;
+               u64     smm_freeze:1;
+               /*
+                * PMU supports separate counter range for writing
+                * values > 32bit.
+                */
+               u64     full_width_write:1;
+       };
+       u64     capabilities;
+};
+
+struct x86_pmu_quirk {
+       struct x86_pmu_quirk *next;
+       void (*func)(void);
+};
+
+union x86_pmu_config {
+       struct {
+               u64 event:8,
+                   umask:8,
+                   usr:1,
+                   os:1,
+                   edge:1,
+                   pc:1,
+                   interrupt:1,
+                   __reserved1:1,
+                   en:1,
+                   inv:1,
+                   cmask:8,
+                   event2:4,
+                   __reserved2:4,
+                   go:1,
+                   ho:1;
+       } bits;
+       u64 value;
+};
+
+#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
+
+enum {
+       x86_lbr_exclusive_lbr,
+       x86_lbr_exclusive_bts,
+       x86_lbr_exclusive_pt,
+       x86_lbr_exclusive_max,
+};
+
+/*
+ * struct x86_pmu - generic x86 pmu
+ */
+struct x86_pmu {
+       /*
+        * Generic x86 PMC bits
+        */
+       const char      *name;
+       int             version;
+       int             (*handle_irq)(struct pt_regs *);
+       void            (*disable_all)(void);
+       void            (*enable_all)(int added);
+       void            (*enable)(struct perf_event *);
+       void            (*disable)(struct perf_event *);
+       int             (*hw_config)(struct perf_event *event);
+       int             (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
+       unsigned        eventsel;
+       unsigned        perfctr;
+       int             (*addr_offset)(int index, bool eventsel);
+       int             (*rdpmc_index)(int index);
+       u64             (*event_map)(int);
+       int             max_events;
+       int             num_counters;
+       int             num_counters_fixed;
+       int             cntval_bits;
+       u64             cntval_mask;
+       union {
+                       unsigned long events_maskl;
+                       unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
+       };
+       int             events_mask_len;
+       int             apic;
+       u64             max_period;
+       struct event_constraint *
+                       (*get_event_constraints)(struct cpu_hw_events *cpuc,
+                                                int idx,
+                                                struct perf_event *event);
+
+       void            (*put_event_constraints)(struct cpu_hw_events *cpuc,
+                                                struct perf_event *event);
+
+       void            (*start_scheduling)(struct cpu_hw_events *cpuc);
+
+       void            (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr);
+
+       void            (*stop_scheduling)(struct cpu_hw_events *cpuc);
+
+       struct event_constraint *event_constraints;
+       struct x86_pmu_quirk *quirks;
+       int             perfctr_second_write;
+       bool            late_ack;
+       unsigned        (*limit_period)(struct perf_event *event, unsigned l);
+
+       /*
+        * sysfs attrs
+        */
+       int             attr_rdpmc_broken;
+       int             attr_rdpmc;
+       struct attribute **format_attrs;
+       struct attribute **event_attrs;
+
+       ssize_t         (*events_sysfs_show)(char *page, u64 config);
+       struct attribute **cpu_events;
+
+       /*
+        * CPU Hotplug hooks
+        */
+       int             (*cpu_prepare)(int cpu);
+       void            (*cpu_starting)(int cpu);
+       void            (*cpu_dying)(int cpu);
+       void            (*cpu_dead)(int cpu);
+
+       void            (*check_microcode)(void);
+       void            (*sched_task)(struct perf_event_context *ctx,
+                                     bool sched_in);
+
+       /*
+        * Intel Arch Perfmon v2+
+        */
+       u64                     intel_ctrl;
+       union perf_capabilities intel_cap;
+
+       /*
+        * Intel DebugStore bits
+        */
+       unsigned int    bts             :1,
+                       bts_active      :1,
+                       pebs            :1,
+                       pebs_active     :1,
+                       pebs_broken     :1,
+                       pebs_prec_dist  :1;
+       int             pebs_record_size;
+       int             pebs_buffer_size;
+       void            (*drain_pebs)(struct pt_regs *regs);
+       struct event_constraint *pebs_constraints;
+       void            (*pebs_aliases)(struct perf_event *event);
+       int             max_pebs_events;
+       unsigned long   free_running_flags;
+
+       /*
+        * Intel LBR
+        */
+       unsigned long   lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
+       int             lbr_nr;                    /* hardware stack size */
+       u64             lbr_sel_mask;              /* LBR_SELECT valid bits */
+       const int       *lbr_sel_map;              /* lbr_select mappings */
+       bool            lbr_double_abort;          /* duplicated lbr aborts */
+
+       /*
+        * Intel PT/LBR/BTS are exclusive
+        */
+       atomic_t        lbr_exclusive[x86_lbr_exclusive_max];
+
+       /*
+        * Extra registers for events
+        */
+       struct extra_reg *extra_regs;
+       unsigned int flags;
+
+       /*
+        * Intel host/guest support (KVM)
+        */
+       struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
+};
+
+struct x86_perf_task_context {
+       u64 lbr_from[MAX_LBR_ENTRIES];
+       u64 lbr_to[MAX_LBR_ENTRIES];
+       u64 lbr_info[MAX_LBR_ENTRIES];
+       int tos;
+       int lbr_callstack_users;
+       int lbr_stack_state;
+};
+
+#define x86_add_quirk(func_)                                           \
+do {                                                                   \
+       static struct x86_pmu_quirk __quirk __initdata = {              \
+               .func = func_,                                          \
+       };                                                              \
+       __quirk.next = x86_pmu.quirks;                                  \
+       x86_pmu.quirks = &__quirk;                                      \
+} while (0)
+
+/*
+ * x86_pmu flags
+ */
+#define PMU_FL_NO_HT_SHARING   0x1 /* no hyper-threading resource sharing */
+#define PMU_FL_HAS_RSP_1       0x2 /* has 2 equivalent offcore_rsp regs   */
+#define PMU_FL_EXCL_CNTRS      0x4 /* has exclusive counter requirements  */
+#define PMU_FL_EXCL_ENABLED    0x8 /* exclusive counter active */
+
+#define EVENT_VAR(_id)  event_attr_##_id
+#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
+
+#define EVENT_ATTR(_name, _id)                                         \
+static struct perf_pmu_events_attr EVENT_VAR(_id) = {                  \
+       .attr           = __ATTR(_name, 0444, events_sysfs_show, NULL), \
+       .id             = PERF_COUNT_HW_##_id,                          \
+       .event_str      = NULL,                                         \
+};
+
+#define EVENT_ATTR_STR(_name, v, str)                                  \
+static struct perf_pmu_events_attr event_attr_##v = {                  \
+       .attr           = __ATTR(_name, 0444, events_sysfs_show, NULL), \
+       .id             = 0,                                            \
+       .event_str      = str,                                          \
+};
+
+extern struct x86_pmu x86_pmu __read_mostly;
+
+static inline bool x86_pmu_has_lbr_callstack(void)
+{
+       return  x86_pmu.lbr_sel_map &&
+               x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
+}
+
+DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
+
+int x86_perf_event_set_period(struct perf_event *event);
+
+/*
+ * Generalized hw caching related hw_event table, filled
+ * in on a per model basis. A value of 0 means
+ * 'not supported', -1 means 'hw_event makes no sense on
+ * this CPU', any other value means the raw hw_event
+ * ID.
+ */
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+extern u64 __read_mostly hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
+extern u64 __read_mostly hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+u64 x86_perf_event_update(struct perf_event *event);
+
+static inline unsigned int x86_pmu_config_addr(int index)
+{
+       return x86_pmu.eventsel + (x86_pmu.addr_offset ?
+                                  x86_pmu.addr_offset(index, true) : index);
+}
+
+static inline unsigned int x86_pmu_event_addr(int index)
+{
+       return x86_pmu.perfctr + (x86_pmu.addr_offset ?
+                                 x86_pmu.addr_offset(index, false) : index);
+}
+
+static inline int x86_pmu_rdpmc_index(int index)
+{
+       return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
+}
+
+int x86_add_exclusive(unsigned int what);
+
+void x86_del_exclusive(unsigned int what);
+
+int x86_reserve_hardware(void);
+
+void x86_release_hardware(void);
+
+void hw_perf_lbr_event_destroy(struct perf_event *event);
+
+int x86_setup_perfctr(struct perf_event *event);
+
+int x86_pmu_hw_config(struct perf_event *event);
+
+void x86_pmu_disable_all(void);
+
+static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
+                                         u64 enable_mask)
+{
+       u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
+
+       if (hwc->extra_reg.reg)
+               wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
+       wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
+}
+
+void x86_pmu_enable_all(int added);
+
+int perf_assign_events(struct event_constraint **constraints, int n,
+                       int wmin, int wmax, int gpmax, int *assign);
+int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
+
+void x86_pmu_stop(struct perf_event *event, int flags);
+
+static inline void x86_pmu_disable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       wrmsrl(hwc->config_base, hwc->config);
+}
+
+void x86_pmu_enable_event(struct perf_event *event);
+
+int x86_pmu_handle_irq(struct pt_regs *regs);
+
+extern struct event_constraint emptyconstraint;
+
+extern struct event_constraint unconstrained;
+
+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+       return ip > PAGE_OFFSET;
+#else
+       return (long)ip < 0;
+#endif
+}
+
+/*
+ * Not all PMUs provide the right context information to place the reported IP
+ * into full context. Specifically segment registers are typically not
+ * supplied.
+ *
+ * Assuming the address is a linear address (it is for IBS), we fake the CS and
+ * vm86 mode using the known zero-based code segment and 'fix up' the registers
+ * to reflect this.
+ *
+ * Intel PEBS/LBR appear to typically provide the effective address, nothing
+ * much we can do about that but pray and treat it like a linear address.
+ */
+static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
+{
+       regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS;
+       if (regs->flags & X86_VM_MASK)
+               regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK);
+       regs->ip = ip;
+}
+
+ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
+ssize_t intel_event_sysfs_show(char *page, u64 config);
+
+struct attribute **merge_attr(struct attribute **a, struct attribute **b);
+
+#ifdef CONFIG_CPU_SUP_AMD
+
+int amd_pmu_init(void);
+
+#else /* CONFIG_CPU_SUP_AMD */
+
+static inline int amd_pmu_init(void)
+{
+       return 0;
+}
+
+#endif /* CONFIG_CPU_SUP_AMD */
+
+#ifdef CONFIG_CPU_SUP_INTEL
+
+static inline bool intel_pmu_has_bts(struct perf_event *event)
+{
+       if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
+           !event->attr.freq && event->hw.sample_period == 1)
+               return true;
+
+       return false;
+}
+
+int intel_pmu_save_and_restart(struct perf_event *event);
+
+struct event_constraint *
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                         struct perf_event *event);
+
+struct intel_shared_regs *allocate_shared_regs(int cpu);
+
+int intel_pmu_init(void);
+
+void init_debug_store_on_cpu(int cpu);
+
+void fini_debug_store_on_cpu(int cpu);
+
+void release_ds_buffers(void);
+
+void reserve_ds_buffers(void);
+
+extern struct event_constraint bts_constraint;
+
+void intel_pmu_enable_bts(u64 config);
+
+void intel_pmu_disable_bts(void);
+
+int intel_pmu_drain_bts_buffer(void);
+
+extern struct event_constraint intel_core2_pebs_event_constraints[];
+
+extern struct event_constraint intel_atom_pebs_event_constraints[];
+
+extern struct event_constraint intel_slm_pebs_event_constraints[];
+
+extern struct event_constraint intel_nehalem_pebs_event_constraints[];
+
+extern struct event_constraint intel_westmere_pebs_event_constraints[];
+
+extern struct event_constraint intel_snb_pebs_event_constraints[];
+
+extern struct event_constraint intel_ivb_pebs_event_constraints[];
+
+extern struct event_constraint intel_hsw_pebs_event_constraints[];
+
+extern struct event_constraint intel_bdw_pebs_event_constraints[];
+
+extern struct event_constraint intel_skl_pebs_event_constraints[];
+
+struct event_constraint *intel_pebs_constraints(struct perf_event *event);
+
+void intel_pmu_pebs_enable(struct perf_event *event);
+
+void intel_pmu_pebs_disable(struct perf_event *event);
+
+void intel_pmu_pebs_enable_all(void);
+
+void intel_pmu_pebs_disable_all(void);
+
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
+
+void intel_ds_init(void);
+
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+
+void intel_pmu_lbr_reset(void);
+
+void intel_pmu_lbr_enable(struct perf_event *event);
+
+void intel_pmu_lbr_disable(struct perf_event *event);
+
+void intel_pmu_lbr_enable_all(bool pmi);
+
+void intel_pmu_lbr_disable_all(void);
+
+void intel_pmu_lbr_read(void);
+
+void intel_pmu_lbr_init_core(void);
+
+void intel_pmu_lbr_init_nhm(void);
+
+void intel_pmu_lbr_init_atom(void);
+
+void intel_pmu_lbr_init_snb(void);
+
+void intel_pmu_lbr_init_hsw(void);
+
+void intel_pmu_lbr_init_skl(void);
+
+void intel_pmu_lbr_init_knl(void);
+
+void intel_pmu_pebs_data_source_nhm(void);
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event);
+
+void intel_pt_interrupt(void);
+
+int intel_bts_interrupt(void);
+
+void intel_bts_enable_local(void);
+
+void intel_bts_disable_local(void);
+
+int p4_pmu_init(void);
+
+int p6_pmu_init(void);
+
+int knc_pmu_init(void);
+
+ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
+                         char *page);
+
+static inline int is_ht_workaround_enabled(void)
+{
+       return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
+}
+
+#else /* CONFIG_CPU_SUP_INTEL */
+
+static inline void reserve_ds_buffers(void)
+{
+}
+
+static inline void release_ds_buffers(void)
+{
+}
+
+static inline int intel_pmu_init(void)
+{
+       return 0;
+}
+
+static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+       return NULL;
+}
+
+static inline int is_ht_workaround_enabled(void)
+{
+       return 0;
+}
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h

index 7bfc85bbb8ffc0578011ceac2c08548bd140ade3..99afb665a004cb8ea82c1751f6ef29b12d116cc6 100644 (file)
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -151,12 +151,6 @@ static inline int alternatives_text_reserved(void *start, void *end)
         ALTINSTR_REPLACEMENT(newinstr2, feature2, 2)                    \
         ".popsection"
  
-/*
- * This must be included *after* the definition of ALTERNATIVE due to
- * <asm/arch_hweight.h>
- */
-#include <asm/cpufeature.h>
-
  /*
   * Alternative instructions for different CPU types or capabilities.
   *
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h

index 3c56ef1ae068e8c52b829bab6719d47fba0ef8bd..5e828da2e18f6e9b4ee05e0abab7997b2fe9c6d3 100644 (file)
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -27,15 +27,23 @@ struct amd_l3_cache {
  };
  
  struct threshold_block {
-       unsigned int            block;
-       unsigned int            bank;
-       unsigned int            cpu;
-       u32                     address;
-       u16                     interrupt_enable;
-       bool                    interrupt_capable;
-       u16                     threshold_limit;
-       struct kobject          kobj;
-       struct list_head        miscj;
+       unsigned int     block;                 /* Number within bank */
+       unsigned int     bank;                  /* MCA bank the block belongs to */
+       unsigned int     cpu;                   /* CPU which controls MCA bank */
+       u32              address;               /* MSR address for the block */
+       u16              interrupt_enable;      /* Enable/Disable APIC interrupt */
+       bool             interrupt_capable;     /* Bank can generate an interrupt. */
+
+       u16              threshold_limit;       /*
+                                                * Value upon which threshold
+                                                * interrupt is generated.
+                                                */
+
+       struct kobject   kobj;                  /* sysfs object */
+       struct list_head miscj;                 /*
+                                                * List of threshold blocks
+                                                * within a bank.
+                                                */
  };
  
  struct threshold_bank {
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h

index c80f6b6f3da222dc86dc00747429b618589502eb..0899cfc8dfe8f84f98edc6a0c19e7ff40225ae74 100644 (file)
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -6,7 +6,6 @@
  
  #include <asm/alternative.h>
  #include <asm/cpufeature.h>
-#include <asm/processor.h>
  #include <asm/apicdef.h>
  #include <linux/atomic.h>
  #include <asm/fixmap.h>
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h

index 259a7c1ef7099bc89b30cdb0ebbd3a311fa88e18..02e799fa43d1b19c878290f6424e1b2f7293074d 100644 (file)
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -1,6 +1,8 @@
  #ifndef _ASM_X86_HWEIGHT_H
  #define _ASM_X86_HWEIGHT_H
  
+#include <asm/cpufeatures.h>
+
  #ifdef CONFIG_64BIT
  /* popcnt %edi, %eax -- redundant REX prefix for alignment */
  #define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h

index 189679aba703537393b87d699a4e11cbfe94e23c..f5063b6659eb37f1a835bc5e614cf251ac48e032 100644 (file)
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -44,19 +44,22 @@
  
  /* Exception table entry */
  #ifdef __ASSEMBLY__
-# define _ASM_EXTABLE(from,to)                                 \
+# define _ASM_EXTABLE_HANDLE(from, to, handler)                        \
         .pushsection "__ex_table","a" ;                         \
-       .balign 8 ;                                             \
+       .balign 4 ;                                             \
         .long (from) - . ;                                      \
         .long (to) - . ;                                        \
+       .long (handler) - . ;                                   \
         .popsection
  
-# define _ASM_EXTABLE_EX(from,to)                              \
-       .pushsection "__ex_table","a" ;                         \
-       .balign 8 ;                                             \
-       .long (from) - . ;                                      \
-       .long (to) - . + 0x7ffffff0 ;                           \
-       .popsection
+# define _ASM_EXTABLE(from, to)                                        \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
+
+# define _ASM_EXTABLE_FAULT(from, to)                          \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
+
+# define _ASM_EXTABLE_EX(from, to)                             \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
  
  # define _ASM_NOKPROBE(entry)                                  \
         .pushsection "_kprobe_blacklist","aw" ;                 \
@@ -89,19 +92,24 @@
         .endm
  
  #else
-# define _ASM_EXTABLE(from,to)                                 \
+# define _EXPAND_EXTABLE_HANDLE(x) #x
+# define _ASM_EXTABLE_HANDLE(from, to, handler)                        \
         " .pushsection \"__ex_table\",\"a\"\n"                  \
-       " .balign 8\n"                                          \
+       " .balign 4\n"                                          \
         " .long (" #from ") - .\n"                              \
         " .long (" #to ") - .\n"                                \
+       " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n"    \
         " .popsection\n"
  
-# define _ASM_EXTABLE_EX(from,to)                              \
-       " .pushsection \"__ex_table\",\"a\"\n"                  \
-       " .balign 8\n"                                          \
-       " .long (" #from ") - .\n"                              \
-       " .long (" #to ") - . + 0x7ffffff0\n"                   \
-       " .popsection\n"
+# define _ASM_EXTABLE(from, to)                                        \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
+
+# define _ASM_EXTABLE_FAULT(from, to)                          \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
+
+# define _ASM_EXTABLE_EX(from, to)                             \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
+
  /* For C file, we already have NOKPROBE_SYMBOL macro */
  #endif
  
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h

index a584e1c50918406a0398cf4a013432e47abf143e..bfb28caf97b1be1f2d6aa8893bd905a39030e310 100644 (file)
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -6,18 +6,17 @@
  
  /*
   * Force strict CPU ordering.
- * And yes, this is required on UP too when we're talking
+ * And yes, this might be required on UP too when we're talking
   * to devices.
   */
  
  #ifdef CONFIG_X86_32
-/*
- * Some non-Intel clones support out of order store. wmb() ceases to be a
- * nop for these.
- */
-#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
-#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
-#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
+#define mb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "mfence", \
+                                     X86_FEATURE_XMM2) ::: "memory", "cc")
+#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "lfence", \
+                                      X86_FEATURE_XMM2) ::: "memory", "cc")
+#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "sfence", \
+                                      X86_FEATURE_XMM2) ::: "memory", "cc")
  #else
  #define mb()   asm volatile("mfence":::"memory")
  #define rmb()  asm volatile("lfence":::"memory")
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h

index cfe3b954d5e41cbd96be1fdb147c164b63581814..7766d1cf096e80d56562d63876f8ca65df869199 100644 (file)
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -91,7 +91,7 @@ set_bit(long nr, volatile unsigned long *addr)
   * If it's called on the same region of memory simultaneously, the effect
   * may be that only one operation succeeds.
   */
-static inline void __set_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __set_bit(long nr, volatile unsigned long *addr)
  {
         asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
  }
@@ -128,13 +128,13 @@ clear_bit(long nr, volatile unsigned long *addr)
   * clear_bit() is atomic and implies release semantics before the memory
   * operation. It can be used for an unlock.
   */
-static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
+static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
  {
         barrier();
         clear_bit(nr, addr);
  }
  
-static inline void __clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
  {
         asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
  }
@@ -151,7 +151,7 @@ static inline void __clear_bit(long nr, volatile unsigned long *addr)
   * No memory barrier is required here, because x86 cannot reorder stores past
   * older loads. Same principle as spin_unlock.
   */
-static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
+static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
  {
         barrier();
         __clear_bit(nr, addr);
@@ -166,7 +166,7 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
   * If it's called on the same region of memory simultaneously, the effect
   * may be that only one operation succeeds.
   */
-static inline void __change_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __change_bit(long nr, volatile unsigned long *addr)
  {
         asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
  }
@@ -180,7 +180,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr)
   * Note that @nr may be almost arbitrarily large; this function is not
   * restricted to acting on a single-word quantity.
   */
-static inline void change_bit(long nr, volatile unsigned long *addr)
+static __always_inline void change_bit(long nr, volatile unsigned long *addr)
  {
         if (IS_IMMEDIATE(nr)) {
                 asm volatile(LOCK_PREFIX "xorb %1,%0"
@@ -201,7 +201,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr)
   * This operation is atomic and cannot be reordered.
   * It also implies a memory barrier.
   */
-static inline int test_and_set_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr)
  {
         GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c");
  }
@@ -228,7 +228,7 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr)
   * If two examples of this operation race, one can appear to succeed
   * but actually fail.  You must protect multiple accesses with a lock.
   */
-static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
+static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
  {
         int oldbit;
  
@@ -247,7 +247,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
   * This operation is atomic and cannot be reordered.
   * It also implies a memory barrier.
   */
-static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
  {
         GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c");
  }
@@ -268,7 +268,7 @@ static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
   * accessed from a hypervisor on the same CPU if running in a VM: don't change
   * this without also updating arch/x86/kernel/kvm.c
   */
-static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
  {
         int oldbit;
  
@@ -280,7 +280,7 @@ static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
  }
  
  /* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
+static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
  {
         int oldbit;
  
@@ -300,7 +300,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
   * This operation is atomic and cannot be reordered.
   * It also implies a memory barrier.
   */
-static inline int test_and_change_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_change_bit(long nr, volatile unsigned long *addr)
  {
         GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c");
  }
@@ -311,7 +311,7 @@ static __always_inline int constant_test_bit(long nr, const volatile unsigned lo
                 (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
  }
  
-static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
+static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr)
  {
         int oldbit;
  
@@ -343,7 +343,7 @@ static int test_bit(int nr, const volatile unsigned long *addr);
   *
   * Undefined if no bit exists, so code should check against 0 first.
   */
-static inline unsigned long __ffs(unsigned long word)
+static __always_inline unsigned long __ffs(unsigned long word)
  {
         asm("rep; bsf %1,%0"
                 : "=r" (word)
@@ -357,7 +357,7 @@ static inline unsigned long __ffs(unsigned long word)
   *
   * Undefined if no zero exists, so code should check against ~0UL first.
   */
-static inline unsigned long ffz(unsigned long word)
+static __always_inline unsigned long ffz(unsigned long word)
  {
         asm("rep; bsf %1,%0"
                 : "=r" (word)
@@ -371,7 +371,7 @@ static inline unsigned long ffz(unsigned long word)
   *
   * Undefined if no set bit exists, so code should check against 0 first.
   */
-static inline unsigned long __fls(unsigned long word)
+static __always_inline unsigned long __fls(unsigned long word)
  {
         asm("bsr %1,%0"
             : "=r" (word)
@@ -393,7 +393,7 @@ static inline unsigned long __fls(unsigned long word)
   * set bit if value is nonzero. The first (least significant) bit
   * is at position 1.
   */
-static inline int ffs(int x)
+static __always_inline int ffs(int x)
  {
         int r;
  
@@ -434,7 +434,7 @@ static inline int ffs(int x)
   * set bit if value is nonzero. The last (most significant) bit is
   * at position 32.
   */
-static inline int fls(int x)
+static __always_inline int fls(int x)
  {
         int r;
  
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h

index e63aa38e85fb23375ecefa40efe8cd9d76da6c43..61518cf79437679788bd41a1ee1c942aea29a8cb 100644 (file)
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -91,16 +91,10 @@ void clflush_cache_range(void *addr, unsigned int size);
  
  #define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
  
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
  extern const int rodata_test_data;
  extern int kernel_set_to_readonly;
  void set_kernel_text_rw(void);
  void set_kernel_text_ro(void);
-#else
-static inline void set_kernel_text_rw(void) { }
-static inline void set_kernel_text_ro(void) { }
-#endif
  
  #ifdef CONFIG_DEBUG_RODATA_TEST
  int rodata_test(void);
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h

index eda81dc0f4ae091c5ff085450ff277f68aa933a9..d194266acb28e52d237c19c21291633c15d99c9e 100644 (file)
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -3,10 +3,11 @@
  #ifndef _ASM_X86_CLOCKSOURCE_H
  #define _ASM_X86_CLOCKSOURCE_H
  
-#define VCLOCK_NONE 0  /* No vDSO clock available.     */
-#define VCLOCK_TSC  1  /* vDSO should use vread_tsc.   */
-#define VCLOCK_HPET 2  /* vDSO should use vread_hpet.  */
-#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */
+#define VCLOCK_NONE    0  /* No vDSO clock available.  */
+#define VCLOCK_TSC     1  /* vDSO should use vread_tsc.        */
+#define VCLOCK_HPET    2  /* vDSO should use vread_hpet.       */
+#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */
+#define VCLOCK_MAX     3
  
  struct arch_clocksource_data {
         int vclock_mode;
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h

index ad19841eddfe142fec6b2b27dc329059d8fb7296..9733361fed6f4f7368d92f98d21b0d0edfbf6261 100644 (file)
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -2,6 +2,7 @@
  #define ASM_X86_CMPXCHG_H
  
  #include <linux/compiler.h>
+#include <asm/cpufeatures.h>
  #include <asm/alternative.h> /* Provides LOCK_PREFIX */
  
  /*
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h

index 7ad8c946429776b6dad30238e9d5e2d65a1a3c87..68e4e8258b84126d59094ea1023568a8272dcdbd 100644 (file)
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -1,288 +1,7 @@
-/*
- * Defines x86 CPU feature bits
- */
  #ifndef _ASM_X86_CPUFEATURE_H
  #define _ASM_X86_CPUFEATURE_H
  
-#ifndef _ASM_X86_REQUIRED_FEATURES_H
-#include <asm/required-features.h>
-#endif
-
-#ifndef _ASM_X86_DISABLED_FEATURES_H
-#include <asm/disabled-features.h>
-#endif
-
-#define NCAPINTS       16      /* N 32-bit words worth of info */
-#define NBUGINTS       1       /* N 32-bit bug flags */
-
-/*
- * Note: If the comment begins with a quoted string, that string is used
- * in /proc/cpuinfo instead of the macro name.  If the string is "",
- * this feature bit is not displayed in /proc/cpuinfo at all.
- */
-
-/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
-#define X86_FEATURE_FPU                ( 0*32+ 0) /* Onboard FPU */
-#define X86_FEATURE_VME                ( 0*32+ 1) /* Virtual Mode Extensions */
-#define X86_FEATURE_DE         ( 0*32+ 2) /* Debugging Extensions */
-#define X86_FEATURE_PSE                ( 0*32+ 3) /* Page Size Extensions */
-#define X86_FEATURE_TSC                ( 0*32+ 4) /* Time Stamp Counter */
-#define X86_FEATURE_MSR                ( 0*32+ 5) /* Model-Specific Registers */
-#define X86_FEATURE_PAE                ( 0*32+ 6) /* Physical Address Extensions */
-#define X86_FEATURE_MCE                ( 0*32+ 7) /* Machine Check Exception */
-#define X86_FEATURE_CX8                ( 0*32+ 8) /* CMPXCHG8 instruction */
-#define X86_FEATURE_APIC       ( 0*32+ 9) /* Onboard APIC */
-#define X86_FEATURE_SEP                ( 0*32+11) /* SYSENTER/SYSEXIT */
-#define X86_FEATURE_MTRR       ( 0*32+12) /* Memory Type Range Registers */
-#define X86_FEATURE_PGE                ( 0*32+13) /* Page Global Enable */
-#define X86_FEATURE_MCA                ( 0*32+14) /* Machine Check Architecture */
-#define X86_FEATURE_CMOV       ( 0*32+15) /* CMOV instructions */
-                                         /* (plus FCMOVcc, FCOMI with FPU) */
-#define X86_FEATURE_PAT                ( 0*32+16) /* Page Attribute Table */
-#define X86_FEATURE_PSE36      ( 0*32+17) /* 36-bit PSEs */
-#define X86_FEATURE_PN         ( 0*32+18) /* Processor serial number */
-#define X86_FEATURE_CLFLUSH    ( 0*32+19) /* CLFLUSH instruction */
-#define X86_FEATURE_DS         ( 0*32+21) /* "dts" Debug Store */
-#define X86_FEATURE_ACPI       ( 0*32+22) /* ACPI via MSR */
-#define X86_FEATURE_MMX                ( 0*32+23) /* Multimedia Extensions */
-#define X86_FEATURE_FXSR       ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
-#define X86_FEATURE_XMM                ( 0*32+25) /* "sse" */
-#define X86_FEATURE_XMM2       ( 0*32+26) /* "sse2" */
-#define X86_FEATURE_SELFSNOOP  ( 0*32+27) /* "ss" CPU self snoop */
-#define X86_FEATURE_HT         ( 0*32+28) /* Hyper-Threading */
-#define X86_FEATURE_ACC                ( 0*32+29) /* "tm" Automatic clock control */
-#define X86_FEATURE_IA64       ( 0*32+30) /* IA-64 processor */
-#define X86_FEATURE_PBE                ( 0*32+31) /* Pending Break Enable */
-
-/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
-/* Don't duplicate feature flags which are redundant with Intel! */
-#define X86_FEATURE_SYSCALL    ( 1*32+11) /* SYSCALL/SYSRET */
-#define X86_FEATURE_MP         ( 1*32+19) /* MP Capable. */
-#define X86_FEATURE_NX         ( 1*32+20) /* Execute Disable */
-#define X86_FEATURE_MMXEXT     ( 1*32+22) /* AMD MMX extensions */
-#define X86_FEATURE_FXSR_OPT   ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
-#define X86_FEATURE_GBPAGES    ( 1*32+26) /* "pdpe1gb" GB pages */
-#define X86_FEATURE_RDTSCP     ( 1*32+27) /* RDTSCP */
-#define X86_FEATURE_LM         ( 1*32+29) /* Long Mode (x86-64) */
-#define X86_FEATURE_3DNOWEXT   ( 1*32+30) /* AMD 3DNow! extensions */
-#define X86_FEATURE_3DNOW      ( 1*32+31) /* 3DNow! */
-
-/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
-#define X86_FEATURE_RECOVERY   ( 2*32+ 0) /* CPU in recovery mode */
-#define X86_FEATURE_LONGRUN    ( 2*32+ 1) /* Longrun power control */
-#define X86_FEATURE_LRTI       ( 2*32+ 3) /* LongRun table interface */
-
-/* Other features, Linux-defined mapping, word 3 */
-/* This range is used for feature bits which conflict or are synthesized */
-#define X86_FEATURE_CXMMX      ( 3*32+ 0) /* Cyrix MMX extensions */
-#define X86_FEATURE_K6_MTRR    ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
-#define X86_FEATURE_CYRIX_ARR  ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
-#define X86_FEATURE_CENTAUR_MCR        ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
-/* cpu types for specific tunings: */
-#define X86_FEATURE_K8         ( 3*32+ 4) /* "" Opteron, Athlon64 */
-#define X86_FEATURE_K7         ( 3*32+ 5) /* "" Athlon */
-#define X86_FEATURE_P3         ( 3*32+ 6) /* "" P3 */
-#define X86_FEATURE_P4         ( 3*32+ 7) /* "" P4 */
-#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
-#define X86_FEATURE_UP         ( 3*32+ 9) /* smp kernel running on up */
-/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */
-#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
-#define X86_FEATURE_PEBS       ( 3*32+12) /* Precise-Event Based Sampling */
-#define X86_FEATURE_BTS                ( 3*32+13) /* Branch Trace Store */
-#define X86_FEATURE_SYSCALL32  ( 3*32+14) /* "" syscall in ia32 userspace */
-#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */
-#define X86_FEATURE_REP_GOOD   ( 3*32+16) /* rep microcode works well */
-#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
-#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
-/* free, was #define X86_FEATURE_11AP  ( 3*32+19) * "" Bad local APIC aka 11AP */
-#define X86_FEATURE_NOPL       ( 3*32+20) /* The NOPL (0F 1F) instructions */
-#define X86_FEATURE_ALWAYS     ( 3*32+21) /* "" Always-present feature */
-#define X86_FEATURE_XTOPOLOGY  ( 3*32+22) /* cpu topology enum extensions */
-#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
-#define X86_FEATURE_NONSTOP_TSC        ( 3*32+24) /* TSC does not stop in C states */
-/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */
-#define X86_FEATURE_EXTD_APICID        ( 3*32+26) /* has extended APICID (8 bits) */
-#define X86_FEATURE_AMD_DCM     ( 3*32+27) /* multi-node processor */
-#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
-#define X86_FEATURE_EAGER_FPU  ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
-#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
-
-/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
-#define X86_FEATURE_XMM3       ( 4*32+ 0) /* "pni" SSE-3 */
-#define X86_FEATURE_PCLMULQDQ  ( 4*32+ 1) /* PCLMULQDQ instruction */
-#define X86_FEATURE_DTES64     ( 4*32+ 2) /* 64-bit Debug Store */
-#define X86_FEATURE_MWAIT      ( 4*32+ 3) /* "monitor" Monitor/Mwait support */
-#define X86_FEATURE_DSCPL      ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
-#define X86_FEATURE_VMX                ( 4*32+ 5) /* Hardware virtualization */
-#define X86_FEATURE_SMX                ( 4*32+ 6) /* Safer mode */
-#define X86_FEATURE_EST                ( 4*32+ 7) /* Enhanced SpeedStep */
-#define X86_FEATURE_TM2                ( 4*32+ 8) /* Thermal Monitor 2 */
-#define X86_FEATURE_SSSE3      ( 4*32+ 9) /* Supplemental SSE-3 */
-#define X86_FEATURE_CID                ( 4*32+10) /* Context ID */
-#define X86_FEATURE_SDBG       ( 4*32+11) /* Silicon Debug */
-#define X86_FEATURE_FMA                ( 4*32+12) /* Fused multiply-add */
-#define X86_FEATURE_CX16       ( 4*32+13) /* CMPXCHG16B */
-#define X86_FEATURE_XTPR       ( 4*32+14) /* Send Task Priority Messages */
-#define X86_FEATURE_PDCM       ( 4*32+15) /* Performance Capabilities */
-#define X86_FEATURE_PCID       ( 4*32+17) /* Process Context Identifiers */
-#define X86_FEATURE_DCA                ( 4*32+18) /* Direct Cache Access */
-#define X86_FEATURE_XMM4_1     ( 4*32+19) /* "sse4_1" SSE-4.1 */
-#define X86_FEATURE_XMM4_2     ( 4*32+20) /* "sse4_2" SSE-4.2 */
-#define X86_FEATURE_X2APIC     ( 4*32+21) /* x2APIC */
-#define X86_FEATURE_MOVBE      ( 4*32+22) /* MOVBE instruction */
-#define X86_FEATURE_POPCNT      ( 4*32+23) /* POPCNT instruction */
-#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */
-#define X86_FEATURE_AES                ( 4*32+25) /* AES instructions */
-#define X86_FEATURE_XSAVE      ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
-#define X86_FEATURE_OSXSAVE    ( 4*32+27) /* "" XSAVE enabled in the OS */
-#define X86_FEATURE_AVX                ( 4*32+28) /* Advanced Vector Extensions */
-#define X86_FEATURE_F16C       ( 4*32+29) /* 16-bit fp conversions */
-#define X86_FEATURE_RDRAND     ( 4*32+30) /* The RDRAND instruction */
-#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
-
-/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
-#define X86_FEATURE_XSTORE     ( 5*32+ 2) /* "rng" RNG present (xstore) */
-#define X86_FEATURE_XSTORE_EN  ( 5*32+ 3) /* "rng_en" RNG enabled */
-#define X86_FEATURE_XCRYPT     ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
-#define X86_FEATURE_XCRYPT_EN  ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
-#define X86_FEATURE_ACE2       ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
-#define X86_FEATURE_ACE2_EN    ( 5*32+ 9) /* ACE v2 enabled */
-#define X86_FEATURE_PHE                ( 5*32+10) /* PadLock Hash Engine */
-#define X86_FEATURE_PHE_EN     ( 5*32+11) /* PHE enabled */
-#define X86_FEATURE_PMM                ( 5*32+12) /* PadLock Montgomery Multiplier */
-#define X86_FEATURE_PMM_EN     ( 5*32+13) /* PMM enabled */
-
-/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
-#define X86_FEATURE_LAHF_LM    ( 6*32+ 0) /* LAHF/SAHF in long mode */
-#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
-#define X86_FEATURE_SVM                ( 6*32+ 2) /* Secure virtual machine */
-#define X86_FEATURE_EXTAPIC    ( 6*32+ 3) /* Extended APIC space */
-#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
-#define X86_FEATURE_ABM                ( 6*32+ 5) /* Advanced bit manipulation */
-#define X86_FEATURE_SSE4A      ( 6*32+ 6) /* SSE-4A */
-#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
-#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
-#define X86_FEATURE_OSVW       ( 6*32+ 9) /* OS Visible Workaround */
-#define X86_FEATURE_IBS                ( 6*32+10) /* Instruction Based Sampling */
-#define X86_FEATURE_XOP                ( 6*32+11) /* extended AVX instructions */
-#define X86_FEATURE_SKINIT     ( 6*32+12) /* SKINIT/STGI instructions */
-#define X86_FEATURE_WDT                ( 6*32+13) /* Watchdog timer */
-#define X86_FEATURE_LWP                ( 6*32+15) /* Light Weight Profiling */
-#define X86_FEATURE_FMA4       ( 6*32+16) /* 4 operands MAC instructions */
-#define X86_FEATURE_TCE                ( 6*32+17) /* translation cache extension */
-#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
-#define X86_FEATURE_TBM                ( 6*32+21) /* trailing bit manipulations */
-#define X86_FEATURE_TOPOEXT    ( 6*32+22) /* topology extensions CPUID leafs */
-#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
-#define X86_FEATURE_PERFCTR_NB  ( 6*32+24) /* NB performance counter extensions */
-#define X86_FEATURE_BPEXT      (6*32+26) /* data breakpoint extension */
-#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */
-#define X86_FEATURE_MWAITX     ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
-
-/*
- * Auxiliary flags: Linux defined - For features scattered in various
- * CPUID levels like 0x6, 0xA etc, word 7.
- *
- * Reuse free bits when adding new feature flags!
- */
-
-#define X86_FEATURE_CPB                ( 7*32+ 2) /* AMD Core Performance Boost */
-#define X86_FEATURE_EPB                ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
-
-#define X86_FEATURE_HW_PSTATE  ( 7*32+ 8) /* AMD HW-PState */
-#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
-
-#define X86_FEATURE_INTEL_PT   ( 7*32+15) /* Intel Processor Trace */
-
-/* Virtualization flags: Linux defined, word 8 */
-#define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
-#define X86_FEATURE_VNMI        ( 8*32+ 1) /* Intel Virtual NMI */
-#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
-#define X86_FEATURE_EPT         ( 8*32+ 3) /* Intel Extended Page Table */
-#define X86_FEATURE_VPID        ( 8*32+ 4) /* Intel Virtual Processor ID */
-
-#define X86_FEATURE_VMMCALL     ( 8*32+15) /* Prefer vmmcall to vmcall */
-#define X86_FEATURE_XENPV       ( 8*32+16) /* "" Xen paravirtual guest */
-
-
-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
-#define X86_FEATURE_FSGSBASE   ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
-#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */
-#define X86_FEATURE_BMI1       ( 9*32+ 3) /* 1st group bit manipulation extensions */
-#define X86_FEATURE_HLE                ( 9*32+ 4) /* Hardware Lock Elision */
-#define X86_FEATURE_AVX2       ( 9*32+ 5) /* AVX2 instructions */
-#define X86_FEATURE_SMEP       ( 9*32+ 7) /* Supervisor Mode Execution Protection */
-#define X86_FEATURE_BMI2       ( 9*32+ 8) /* 2nd group bit manipulation extensions */
-#define X86_FEATURE_ERMS       ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
-#define X86_FEATURE_INVPCID    ( 9*32+10) /* Invalidate Processor Context ID */
-#define X86_FEATURE_RTM                ( 9*32+11) /* Restricted Transactional Memory */
-#define X86_FEATURE_CQM                ( 9*32+12) /* Cache QoS Monitoring */
-#define X86_FEATURE_MPX                ( 9*32+14) /* Memory Protection Extension */
-#define X86_FEATURE_AVX512F    ( 9*32+16) /* AVX-512 Foundation */
-#define X86_FEATURE_RDSEED     ( 9*32+18) /* The RDSEED instruction */
-#define X86_FEATURE_ADX                ( 9*32+19) /* The ADCX and ADOX instructions */
-#define X86_FEATURE_SMAP       ( 9*32+20) /* Supervisor Mode Access Prevention */
-#define X86_FEATURE_PCOMMIT    ( 9*32+22) /* PCOMMIT instruction */
-#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
-#define X86_FEATURE_CLWB       ( 9*32+24) /* CLWB instruction */
-#define X86_FEATURE_AVX512PF   ( 9*32+26) /* AVX-512 Prefetch */
-#define X86_FEATURE_AVX512ER   ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
-#define X86_FEATURE_AVX512CD   ( 9*32+28) /* AVX-512 Conflict Detection */
-#define X86_FEATURE_SHA_NI     ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
-
-/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
-#define X86_FEATURE_XSAVEOPT   (10*32+ 0) /* XSAVEOPT */
-#define X86_FEATURE_XSAVEC     (10*32+ 1) /* XSAVEC */
-#define X86_FEATURE_XGETBV1    (10*32+ 2) /* XGETBV with ECX = 1 */
-#define X86_FEATURE_XSAVES     (10*32+ 3) /* XSAVES/XRSTORS */
-
-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
-#define X86_FEATURE_CQM_LLC    (11*32+ 1) /* LLC QoS if 1 */
-
-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
-#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
-
-/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
-#define X86_FEATURE_CLZERO     (13*32+0) /* CLZERO instruction */
-
-/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
-#define X86_FEATURE_DTHERM     (14*32+ 0) /* Digital Thermal Sensor */
-#define X86_FEATURE_IDA                (14*32+ 1) /* Intel Dynamic Acceleration */
-#define X86_FEATURE_ARAT       (14*32+ 2) /* Always Running APIC Timer */
-#define X86_FEATURE_PLN                (14*32+ 4) /* Intel Power Limit Notification */
-#define X86_FEATURE_PTS                (14*32+ 6) /* Intel Package Thermal Status */
-#define X86_FEATURE_HWP                (14*32+ 7) /* Intel Hardware P-states */
-#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
-#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
-#define X86_FEATURE_HWP_EPP    (14*32+10) /* HWP Energy Perf. Preference */
-#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
-
-/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */
-#define X86_FEATURE_NPT                (15*32+ 0) /* Nested Page Table support */
-#define X86_FEATURE_LBRV       (15*32+ 1) /* LBR Virtualization support */
-#define X86_FEATURE_SVML       (15*32+ 2) /* "svm_lock" SVM locking MSR */
-#define X86_FEATURE_NRIPS      (15*32+ 3) /* "nrip_save" SVM next_rip save */
-#define X86_FEATURE_TSCRATEMSR  (15*32+ 4) /* "tsc_scale" TSC scaling support */
-#define X86_FEATURE_VMCBCLEAN   (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
-#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
-#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
-#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
-#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
-
-/*
- * BUG word(s)
- */
-#define X86_BUG(x)             (NCAPINTS*32 + (x))
-
-#define X86_BUG_F00F           X86_BUG(0) /* Intel F00F */
-#define X86_BUG_FDIV           X86_BUG(1) /* FPU FDIV */
-#define X86_BUG_COMA           X86_BUG(2) /* Cyrix 6x86 coma */
-#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
-#define X86_BUG_AMD_APIC_C1E   X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
-#define X86_BUG_11AP           X86_BUG(5) /* Bad local APIC aka 11AP */
-#define X86_BUG_FXSAVE_LEAK    X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
-#define X86_BUG_CLFLUSH_MONITOR        X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
-#define X86_BUG_SYSRET_SS_ATTRS        X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
+#include <asm/processor.h>
  
  #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
  
@@ -369,8 +88,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
   * is not relevant.
   */
  #define cpu_feature_enabled(bit)       \
-       (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 :  \
-        cpu_has(&boot_cpu_data, bit))
+       (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit))
  
  #define boot_cpu_has(bit)      cpu_has(&boot_cpu_data, bit)
  
@@ -406,106 +124,19 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
  #define cpu_has_osxsave                boot_cpu_has(X86_FEATURE_OSXSAVE)
  #define cpu_has_hypervisor     boot_cpu_has(X86_FEATURE_HYPERVISOR)
  /*
- * Do not add any more of those clumsy macros - use static_cpu_has_safe() for
+ * Do not add any more of those clumsy macros - use static_cpu_has() for
   * fast paths and boot_cpu_has() otherwise!
   */
  
-#if __GNUC__ >= 4 && defined(CONFIG_X86_FAST_FEATURE_TESTS)
-extern void warn_pre_alternatives(void);
-extern bool __static_cpu_has_safe(u16 bit);
-
+#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
  /*
   * Static testing of CPU features.  Used the same as boot_cpu_has().
- * These are only valid after alternatives have run, but will statically
- * patch the target code for additional performance.
+ * These will statically patch the target code for additional
+ * performance.
   */
-static __always_inline __pure bool __static_cpu_has(u16 bit)
-{
-#ifdef CC_HAVE_ASM_GOTO
-
-#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS
-
-               /*
-                * Catch too early usage of this before alternatives
-                * have run.
-                */
-               asm_volatile_goto("1: jmp %l[t_warn]\n"
-                        "2:\n"
-                        ".section .altinstructions,\"a\"\n"
-                        " .long 1b - .\n"
-                        " .long 0\n"           /* no replacement */
-                        " .word %P0\n"         /* 1: do replace */
-                        " .byte 2b - 1b\n"     /* source len */
-                        " .byte 0\n"           /* replacement len */
-                        " .byte 0\n"           /* pad len */
-                        ".previous\n"
-                        /* skipping size check since replacement size = 0 */
-                        : : "i" (X86_FEATURE_ALWAYS) : : t_warn);
-
-#endif
-
-               asm_volatile_goto("1: jmp %l[t_no]\n"
-                        "2:\n"
-                        ".section .altinstructions,\"a\"\n"
-                        " .long 1b - .\n"
-                        " .long 0\n"           /* no replacement */
-                        " .word %P0\n"         /* feature bit */
-                        " .byte 2b - 1b\n"     /* source len */
-                        " .byte 0\n"           /* replacement len */
-                        " .byte 0\n"           /* pad len */
-                        ".previous\n"
-                        /* skipping size check since replacement size = 0 */
-                        : : "i" (bit) : : t_no);
-               return true;
-       t_no:
-               return false;
-
-#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS
-       t_warn:
-               warn_pre_alternatives();
-               return false;
-#endif
-
-#else /* CC_HAVE_ASM_GOTO */
-
-               u8 flag;
-               /* Open-coded due to __stringify() in ALTERNATIVE() */
-               asm volatile("1: movb $0,%0\n"
-                            "2:\n"
-                            ".section .altinstructions,\"a\"\n"
-                            " .long 1b - .\n"
-                            " .long 3f - .\n"
-                            " .word %P1\n"             /* feature bit */
-                            " .byte 2b - 1b\n"         /* source len */
-                            " .byte 4f - 3f\n"         /* replacement len */
-                            " .byte 0\n"               /* pad len */
-                            ".previous\n"
-                            ".section .discard,\"aw\",@progbits\n"
-                            " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
-                            ".previous\n"
-                            ".section .altinstr_replacement,\"ax\"\n"
-                            "3: movb $1,%0\n"
-                            "4:\n"
-                            ".previous\n"
-                            : "=qm" (flag) : "i" (bit));
-               return flag;
-
-#endif /* CC_HAVE_ASM_GOTO */
-}
-
-#define static_cpu_has(bit)                                    \
-(                                                              \
-       __builtin_constant_p(boot_cpu_has(bit)) ?               \
-               boot_cpu_has(bit) :                             \
-       __builtin_constant_p(bit) ?                             \
-               __static_cpu_has(bit) :                         \
-               boot_cpu_has(bit)                               \
-)
-
-static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
+static __always_inline __pure bool _static_cpu_has(u16 bit)
  {
-#ifdef CC_HAVE_ASM_GOTO
-               asm_volatile_goto("1: jmp %l[t_dynamic]\n"
+               asm_volatile_goto("1: jmp 6f\n"
                          "2:\n"
                          ".skip -(((5f-4f) - (2b-1b)) > 0) * "
                                  "((5f-4f) - (2b-1b)),0x90\n"
@@ -530,66 +161,34 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
                          " .byte 0\n"                   /* repl len */
                          " .byte 0\n"                   /* pad len */
                          ".previous\n"
-                        : : "i" (bit), "i" (X86_FEATURE_ALWAYS)
-                        : : t_dynamic, t_no);
+                        ".section .altinstr_aux,\"ax\"\n"
+                        "6:\n"
+                        " testb %[bitnum],%[cap_byte]\n"
+                        " jnz %l[t_yes]\n"
+                        " jmp %l[t_no]\n"
+                        ".previous\n"
+                        : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
+                            [bitnum] "i" (1 << (bit & 7)),
+                            [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
+                        : : t_yes, t_no);
+       t_yes:
                 return true;
         t_no:
                 return false;
-       t_dynamic:
-               return __static_cpu_has_safe(bit);
-#else
-               u8 flag;
-               /* Open-coded due to __stringify() in ALTERNATIVE() */
-               asm volatile("1: movb $2,%0\n"
-                            "2:\n"
-                            ".section .altinstructions,\"a\"\n"
-                            " .long 1b - .\n"          /* src offset */
-                            " .long 3f - .\n"          /* repl offset */
-                            " .word %P2\n"             /* always replace */
-                            " .byte 2b - 1b\n"         /* source len */
-                            " .byte 4f - 3f\n"         /* replacement len */
-                            " .byte 0\n"               /* pad len */
-                            ".previous\n"
-                            ".section .discard,\"aw\",@progbits\n"
-                            " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
-                            ".previous\n"
-                            ".section .altinstr_replacement,\"ax\"\n"
-                            "3: movb $0,%0\n"
-                            "4:\n"
-                            ".previous\n"
-                            ".section .altinstructions,\"a\"\n"
-                            " .long 1b - .\n"          /* src offset */
-                            " .long 5f - .\n"          /* repl offset */
-                            " .word %P1\n"             /* feature bit */
-                            " .byte 4b - 3b\n"         /* src len */
-                            " .byte 6f - 5f\n"         /* repl len */
-                            " .byte 0\n"               /* pad len */
-                            ".previous\n"
-                            ".section .discard,\"aw\",@progbits\n"
-                            " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */
-                            ".previous\n"
-                            ".section .altinstr_replacement,\"ax\"\n"
-                            "5: movb $1,%0\n"
-                            "6:\n"
-                            ".previous\n"
-                            : "=qm" (flag)
-                            : "i" (bit), "i" (X86_FEATURE_ALWAYS));
-               return (flag == 2 ? __static_cpu_has_safe(bit) : flag);
-#endif /* CC_HAVE_ASM_GOTO */
  }
  
-#define static_cpu_has_safe(bit)                               \
+#define static_cpu_has(bit)                                    \
  (                                                              \
         __builtin_constant_p(boot_cpu_has(bit)) ?               \
                 boot_cpu_has(bit) :                             \
-               _static_cpu_has_safe(bit)                       \
+               _static_cpu_has(bit)                            \
  )
  #else
  /*
- * gcc 3.x is too stupid to do the static test; fall back to dynamic.
+ * Fall back to dynamic for gcc versions which don't support asm goto. Should be
+ * a minority now anyway.
   */
  #define static_cpu_has(bit)            boot_cpu_has(bit)
-#define static_cpu_has_safe(bit)       boot_cpu_has(bit)
  #endif
  
  #define cpu_has_bug(c, bit)            cpu_has(c, (bit))
@@ -597,7 +196,6 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
  #define clear_cpu_bug(c, bit)          clear_cpu_cap(c, (bit))
  
  #define static_cpu_has_bug(bit)                static_cpu_has((bit))
-#define static_cpu_has_bug_safe(bit)   static_cpu_has_safe((bit))
  #define boot_cpu_has_bug(bit)          cpu_has_bug(&boot_cpu_data, (bit))
  
  #define MAX_CPU_FEATURES               (NCAPINTS * 32)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h

new file mode 100644 (file)

index 0000000..9e0567f
--- /dev/null
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -0,0 +1,300 @@
+#ifndef _ASM_X86_CPUFEATURES_H
+#define _ASM_X86_CPUFEATURES_H
+
+#ifndef _ASM_X86_REQUIRED_FEATURES_H
+#include <asm/required-features.h>
+#endif
+
+#ifndef _ASM_X86_DISABLED_FEATURES_H
+#include <asm/disabled-features.h>
+#endif
+
+/*
+ * Defines x86 CPU feature bits
+ */
+#define NCAPINTS       16      /* N 32-bit words worth of info */
+#define NBUGINTS       1       /* N 32-bit bug flags */
+
+/*
+ * Note: If the comment begins with a quoted string, that string is used
+ * in /proc/cpuinfo instead of the macro name.  If the string is "",
+ * this feature bit is not displayed in /proc/cpuinfo at all.
+ */
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
+#define X86_FEATURE_FPU                ( 0*32+ 0) /* Onboard FPU */
+#define X86_FEATURE_VME                ( 0*32+ 1) /* Virtual Mode Extensions */
+#define X86_FEATURE_DE         ( 0*32+ 2) /* Debugging Extensions */
+#define X86_FEATURE_PSE                ( 0*32+ 3) /* Page Size Extensions */
+#define X86_FEATURE_TSC                ( 0*32+ 4) /* Time Stamp Counter */
+#define X86_FEATURE_MSR                ( 0*32+ 5) /* Model-Specific Registers */
+#define X86_FEATURE_PAE                ( 0*32+ 6) /* Physical Address Extensions */
+#define X86_FEATURE_MCE                ( 0*32+ 7) /* Machine Check Exception */
+#define X86_FEATURE_CX8                ( 0*32+ 8) /* CMPXCHG8 instruction */
+#define X86_FEATURE_APIC       ( 0*32+ 9) /* Onboard APIC */
+#define X86_FEATURE_SEP                ( 0*32+11) /* SYSENTER/SYSEXIT */
+#define X86_FEATURE_MTRR       ( 0*32+12) /* Memory Type Range Registers */
+#define X86_FEATURE_PGE                ( 0*32+13) /* Page Global Enable */
+#define X86_FEATURE_MCA                ( 0*32+14) /* Machine Check Architecture */
+#define X86_FEATURE_CMOV       ( 0*32+15) /* CMOV instructions */
+                                         /* (plus FCMOVcc, FCOMI with FPU) */
+#define X86_FEATURE_PAT                ( 0*32+16) /* Page Attribute Table */
+#define X86_FEATURE_PSE36      ( 0*32+17) /* 36-bit PSEs */
+#define X86_FEATURE_PN         ( 0*32+18) /* Processor serial number */
+#define X86_FEATURE_CLFLUSH    ( 0*32+19) /* CLFLUSH instruction */
+#define X86_FEATURE_DS         ( 0*32+21) /* "dts" Debug Store */
+#define X86_FEATURE_ACPI       ( 0*32+22) /* ACPI via MSR */
+#define X86_FEATURE_MMX                ( 0*32+23) /* Multimedia Extensions */
+#define X86_FEATURE_FXSR       ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
+#define X86_FEATURE_XMM                ( 0*32+25) /* "sse" */
+#define X86_FEATURE_XMM2       ( 0*32+26) /* "sse2" */
+#define X86_FEATURE_SELFSNOOP  ( 0*32+27) /* "ss" CPU self snoop */
+#define X86_FEATURE_HT         ( 0*32+28) /* Hyper-Threading */
+#define X86_FEATURE_ACC                ( 0*32+29) /* "tm" Automatic clock control */
+#define X86_FEATURE_IA64       ( 0*32+30) /* IA-64 processor */
+#define X86_FEATURE_PBE                ( 0*32+31) /* Pending Break Enable */
+
+/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
+/* Don't duplicate feature flags which are redundant with Intel! */
+#define X86_FEATURE_SYSCALL    ( 1*32+11) /* SYSCALL/SYSRET */
+#define X86_FEATURE_MP         ( 1*32+19) /* MP Capable. */
+#define X86_FEATURE_NX         ( 1*32+20) /* Execute Disable */
+#define X86_FEATURE_MMXEXT     ( 1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_FXSR_OPT   ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
+#define X86_FEATURE_GBPAGES    ( 1*32+26) /* "pdpe1gb" GB pages */
+#define X86_FEATURE_RDTSCP     ( 1*32+27) /* RDTSCP */
+#define X86_FEATURE_LM         ( 1*32+29) /* Long Mode (x86-64) */
+#define X86_FEATURE_3DNOWEXT   ( 1*32+30) /* AMD 3DNow! extensions */
+#define X86_FEATURE_3DNOW      ( 1*32+31) /* 3DNow! */
+
+/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
+#define X86_FEATURE_RECOVERY   ( 2*32+ 0) /* CPU in recovery mode */
+#define X86_FEATURE_LONGRUN    ( 2*32+ 1) /* Longrun power control */
+#define X86_FEATURE_LRTI       ( 2*32+ 3) /* LongRun table interface */
+
+/* Other features, Linux-defined mapping, word 3 */
+/* This range is used for feature bits which conflict or are synthesized */
+#define X86_FEATURE_CXMMX      ( 3*32+ 0) /* Cyrix MMX extensions */
+#define X86_FEATURE_K6_MTRR    ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
+#define X86_FEATURE_CYRIX_ARR  ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
+#define X86_FEATURE_CENTAUR_MCR        ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
+/* cpu types for specific tunings: */
+#define X86_FEATURE_K8         ( 3*32+ 4) /* "" Opteron, Athlon64 */
+#define X86_FEATURE_K7         ( 3*32+ 5) /* "" Athlon */
+#define X86_FEATURE_P3         ( 3*32+ 6) /* "" P3 */
+#define X86_FEATURE_P4         ( 3*32+ 7) /* "" P4 */
+#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
+#define X86_FEATURE_UP         ( 3*32+ 9) /* smp kernel running on up */
+/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */
+#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS       ( 3*32+12) /* Precise-Event Based Sampling */
+#define X86_FEATURE_BTS                ( 3*32+13) /* Branch Trace Store */
+#define X86_FEATURE_SYSCALL32  ( 3*32+14) /* "" syscall in ia32 userspace */
+#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */
+#define X86_FEATURE_REP_GOOD   ( 3*32+16) /* rep microcode works well */
+#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
+#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
+/* free, was #define X86_FEATURE_11AP  ( 3*32+19) * "" Bad local APIC aka 11AP */
+#define X86_FEATURE_NOPL       ( 3*32+20) /* The NOPL (0F 1F) instructions */
+#define X86_FEATURE_ALWAYS     ( 3*32+21) /* "" Always-present feature */
+#define X86_FEATURE_XTOPOLOGY  ( 3*32+22) /* cpu topology enum extensions */
+#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
+#define X86_FEATURE_NONSTOP_TSC        ( 3*32+24) /* TSC does not stop in C states */
+/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */
+#define X86_FEATURE_EXTD_APICID        ( 3*32+26) /* has extended APICID (8 bits) */
+#define X86_FEATURE_AMD_DCM     ( 3*32+27) /* multi-node processor */
+#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
+#define X86_FEATURE_EAGER_FPU  ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
+#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
+#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
+#define X86_FEATURE_XMM3       ( 4*32+ 0) /* "pni" SSE-3 */
+#define X86_FEATURE_PCLMULQDQ  ( 4*32+ 1) /* PCLMULQDQ instruction */
+#define X86_FEATURE_DTES64     ( 4*32+ 2) /* 64-bit Debug Store */
+#define X86_FEATURE_MWAIT      ( 4*32+ 3) /* "monitor" Monitor/Mwait support */
+#define X86_FEATURE_DSCPL      ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
+#define X86_FEATURE_VMX                ( 4*32+ 5) /* Hardware virtualization */
+#define X86_FEATURE_SMX                ( 4*32+ 6) /* Safer mode */
+#define X86_FEATURE_EST                ( 4*32+ 7) /* Enhanced SpeedStep */
+#define X86_FEATURE_TM2                ( 4*32+ 8) /* Thermal Monitor 2 */
+#define X86_FEATURE_SSSE3      ( 4*32+ 9) /* Supplemental SSE-3 */
+#define X86_FEATURE_CID                ( 4*32+10) /* Context ID */
+#define X86_FEATURE_SDBG       ( 4*32+11) /* Silicon Debug */
+#define X86_FEATURE_FMA                ( 4*32+12) /* Fused multiply-add */
+#define X86_FEATURE_CX16       ( 4*32+13) /* CMPXCHG16B */
+#define X86_FEATURE_XTPR       ( 4*32+14) /* Send Task Priority Messages */
+#define X86_FEATURE_PDCM       ( 4*32+15) /* Performance Capabilities */
+#define X86_FEATURE_PCID       ( 4*32+17) /* Process Context Identifiers */
+#define X86_FEATURE_DCA                ( 4*32+18) /* Direct Cache Access */
+#define X86_FEATURE_XMM4_1     ( 4*32+19) /* "sse4_1" SSE-4.1 */
+#define X86_FEATURE_XMM4_2     ( 4*32+20) /* "sse4_2" SSE-4.2 */
+#define X86_FEATURE_X2APIC     ( 4*32+21) /* x2APIC */
+#define X86_FEATURE_MOVBE      ( 4*32+22) /* MOVBE instruction */
+#define X86_FEATURE_POPCNT      ( 4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */
+#define X86_FEATURE_AES                ( 4*32+25) /* AES instructions */
+#define X86_FEATURE_XSAVE      ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
+#define X86_FEATURE_OSXSAVE    ( 4*32+27) /* "" XSAVE enabled in the OS */
+#define X86_FEATURE_AVX                ( 4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_F16C       ( 4*32+29) /* 16-bit fp conversions */
+#define X86_FEATURE_RDRAND     ( 4*32+30) /* The RDRAND instruction */
+#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
+
+/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
+#define X86_FEATURE_XSTORE     ( 5*32+ 2) /* "rng" RNG present (xstore) */
+#define X86_FEATURE_XSTORE_EN  ( 5*32+ 3) /* "rng_en" RNG enabled */
+#define X86_FEATURE_XCRYPT     ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
+#define X86_FEATURE_XCRYPT_EN  ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
+#define X86_FEATURE_ACE2       ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
+#define X86_FEATURE_ACE2_EN    ( 5*32+ 9) /* ACE v2 enabled */
+#define X86_FEATURE_PHE                ( 5*32+10) /* PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN     ( 5*32+11) /* PHE enabled */
+#define X86_FEATURE_PMM                ( 5*32+12) /* PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN     ( 5*32+13) /* PMM enabled */
+
+/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
+#define X86_FEATURE_LAHF_LM    ( 6*32+ 0) /* LAHF/SAHF in long mode */
+#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
+#define X86_FEATURE_SVM                ( 6*32+ 2) /* Secure virtual machine */
+#define X86_FEATURE_EXTAPIC    ( 6*32+ 3) /* Extended APIC space */
+#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
+#define X86_FEATURE_ABM                ( 6*32+ 5) /* Advanced bit manipulation */
+#define X86_FEATURE_SSE4A      ( 6*32+ 6) /* SSE-4A */
+#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
+#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
+#define X86_FEATURE_OSVW       ( 6*32+ 9) /* OS Visible Workaround */
+#define X86_FEATURE_IBS                ( 6*32+10) /* Instruction Based Sampling */
+#define X86_FEATURE_XOP                ( 6*32+11) /* extended AVX instructions */
+#define X86_FEATURE_SKINIT     ( 6*32+12) /* SKINIT/STGI instructions */
+#define X86_FEATURE_WDT                ( 6*32+13) /* Watchdog timer */
+#define X86_FEATURE_LWP                ( 6*32+15) /* Light Weight Profiling */
+#define X86_FEATURE_FMA4       ( 6*32+16) /* 4 operands MAC instructions */
+#define X86_FEATURE_TCE                ( 6*32+17) /* translation cache extension */
+#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
+#define X86_FEATURE_TBM                ( 6*32+21) /* trailing bit manipulations */
+#define X86_FEATURE_TOPOEXT    ( 6*32+22) /* topology extensions CPUID leafs */
+#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
+#define X86_FEATURE_PERFCTR_NB  ( 6*32+24) /* NB performance counter extensions */
+#define X86_FEATURE_BPEXT      (6*32+26) /* data breakpoint extension */
+#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */
+#define X86_FEATURE_MWAITX     ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
+
+/*
+ * Auxiliary flags: Linux defined - For features scattered in various
+ * CPUID levels like 0x6, 0xA etc, word 7.
+ *
+ * Reuse free bits when adding new feature flags!
+ */
+
+#define X86_FEATURE_CPB                ( 7*32+ 2) /* AMD Core Performance Boost */
+#define X86_FEATURE_EPB                ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+
+#define X86_FEATURE_HW_PSTATE  ( 7*32+ 8) /* AMD HW-PState */
+#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+
+#define X86_FEATURE_INTEL_PT   ( 7*32+15) /* Intel Processor Trace */
+
+/* Virtualization flags: Linux defined, word 8 */
+#define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
+#define X86_FEATURE_VNMI        ( 8*32+ 1) /* Intel Virtual NMI */
+#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
+#define X86_FEATURE_EPT         ( 8*32+ 3) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID        ( 8*32+ 4) /* Intel Virtual Processor ID */
+
+#define X86_FEATURE_VMMCALL     ( 8*32+15) /* Prefer vmmcall to vmcall */
+#define X86_FEATURE_XENPV       ( 8*32+16) /* "" Xen paravirtual guest */
+
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
+#define X86_FEATURE_FSGSBASE   ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */
+#define X86_FEATURE_BMI1       ( 9*32+ 3) /* 1st group bit manipulation extensions */
+#define X86_FEATURE_HLE                ( 9*32+ 4) /* Hardware Lock Elision */
+#define X86_FEATURE_AVX2       ( 9*32+ 5) /* AVX2 instructions */
+#define X86_FEATURE_SMEP       ( 9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2       ( 9*32+ 8) /* 2nd group bit manipulation extensions */
+#define X86_FEATURE_ERMS       ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
+#define X86_FEATURE_INVPCID    ( 9*32+10) /* Invalidate Processor Context ID */
+#define X86_FEATURE_RTM                ( 9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_CQM                ( 9*32+12) /* Cache QoS Monitoring */
+#define X86_FEATURE_MPX                ( 9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_AVX512F    ( 9*32+16) /* AVX-512 Foundation */
+#define X86_FEATURE_AVX512DQ   ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
+#define X86_FEATURE_RDSEED     ( 9*32+18) /* The RDSEED instruction */
+#define X86_FEATURE_ADX                ( 9*32+19) /* The ADCX and ADOX instructions */
+#define X86_FEATURE_SMAP       ( 9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_PCOMMIT    ( 9*32+22) /* PCOMMIT instruction */
+#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
+#define X86_FEATURE_CLWB       ( 9*32+24) /* CLWB instruction */
+#define X86_FEATURE_AVX512PF   ( 9*32+26) /* AVX-512 Prefetch */
+#define X86_FEATURE_AVX512ER   ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
+#define X86_FEATURE_AVX512CD   ( 9*32+28) /* AVX-512 Conflict Detection */
+#define X86_FEATURE_SHA_NI     ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
+#define X86_FEATURE_AVX512BW   ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
+#define X86_FEATURE_AVX512VL   ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */
+
+/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
+#define X86_FEATURE_XSAVEOPT   (10*32+ 0) /* XSAVEOPT */
+#define X86_FEATURE_XSAVEC     (10*32+ 1) /* XSAVEC */
+#define X86_FEATURE_XGETBV1    (10*32+ 2) /* XGETBV with ECX = 1 */
+#define X86_FEATURE_XSAVES     (10*32+ 3) /* XSAVES/XRSTORS */
+
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
+#define X86_FEATURE_CQM_LLC    (11*32+ 1) /* LLC QoS if 1 */
+
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
+#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+
+/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
+#define X86_FEATURE_CLZERO     (13*32+0) /* CLZERO instruction */
+
+/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
+#define X86_FEATURE_DTHERM     (14*32+ 0) /* Digital Thermal Sensor */
+#define X86_FEATURE_IDA                (14*32+ 1) /* Intel Dynamic Acceleration */
+#define X86_FEATURE_ARAT       (14*32+ 2) /* Always Running APIC Timer */
+#define X86_FEATURE_PLN                (14*32+ 4) /* Intel Power Limit Notification */
+#define X86_FEATURE_PTS                (14*32+ 6) /* Intel Package Thermal Status */
+#define X86_FEATURE_HWP                (14*32+ 7) /* Intel Hardware P-states */
+#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
+#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
+#define X86_FEATURE_HWP_EPP    (14*32+10) /* HWP Energy Perf. Preference */
+#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
+
+/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */
+#define X86_FEATURE_NPT                (15*32+ 0) /* Nested Page Table support */
+#define X86_FEATURE_LBRV       (15*32+ 1) /* LBR Virtualization support */
+#define X86_FEATURE_SVML       (15*32+ 2) /* "svm_lock" SVM locking MSR */
+#define X86_FEATURE_NRIPS      (15*32+ 3) /* "nrip_save" SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR  (15*32+ 4) /* "tsc_scale" TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN   (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
+#define X86_FEATURE_AVIC       (15*32+13) /* Virtual Interrupt Controller */
+
+/*
+ * BUG word(s)
+ */
+#define X86_BUG(x)             (NCAPINTS*32 + (x))
+
+#define X86_BUG_F00F           X86_BUG(0) /* Intel F00F */
+#define X86_BUG_FDIV           X86_BUG(1) /* FPU FDIV */
+#define X86_BUG_COMA           X86_BUG(2) /* Cyrix 6x86 coma */
+#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
+#define X86_BUG_AMD_APIC_C1E   X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
+#define X86_BUG_11AP           X86_BUG(5) /* Bad local APIC aka 11AP */
+#define X86_BUG_FXSAVE_LEAK    X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
+#define X86_BUG_CLFLUSH_MONITOR        X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_SYSRET_SS_ATTRS        X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
+
+#ifdef CONFIG_X86_32
+/*
+ * 64-bit kernels don't use X86_BUG_ESPFIX.  Make the define conditional
+ * to avoid confusion.
+ */
+#define X86_BUG_ESPFIX         X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
+#endif
+
+#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h

index 278441f39856edf31edca4f5391479c356b19a61..eb5deb42484d5e283adeaa191c004d1566f11a7f 100644 (file)
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -98,4 +98,27 @@ struct desc_ptr {
  
  #endif /* !__ASSEMBLY__ */
  
+/* Access rights as returned by LAR */
+#define AR_TYPE_RODATA         (0 * (1 << 9))
+#define AR_TYPE_RWDATA         (1 * (1 << 9))
+#define AR_TYPE_RODATA_EXPDOWN (2 * (1 << 9))
+#define AR_TYPE_RWDATA_EXPDOWN (3 * (1 << 9))
+#define AR_TYPE_XOCODE         (4 * (1 << 9))
+#define AR_TYPE_XRCODE         (5 * (1 << 9))
+#define AR_TYPE_XOCODE_CONF    (6 * (1 << 9))
+#define AR_TYPE_XRCODE_CONF    (7 * (1 << 9))
+#define AR_TYPE_MASK           (7 * (1 << 9))
+
+#define AR_DPL0                        (0 * (1 << 13))
+#define AR_DPL3                        (3 * (1 << 13))
+#define AR_DPL_MASK            (3 * (1 << 13))
+
+#define AR_A                   (1 << 8)   /* "Accessed" */
+#define AR_S                   (1 << 12)  /* If clear, "System" segment */
+#define AR_P                   (1 << 15)  /* "Present" */
+#define AR_AVL                 (1 << 20)  /* "AVaiLable" (no HW effect) */
+#define AR_L                   (1 << 21)  /* "Long mode" for code segments */
+#define AR_DB                  (1 << 22)  /* D/B, effect depends on type */
+#define AR_G                   (1 << 23)  /* "Granularity" (limit in pages) */
+
  #endif /* _ASM_X86_DESC_DEFS_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h

index 1514753fd43553e079696712b48a8d08b6966e98..15340e36ddcb3364e16eb63cd61c61a42676d756 100644 (file)
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -256,7 +256,7 @@ extern int force_personality32;
     instruction set this CPU supports.  This could be done in user space,
     but it's not easy, and we've already done it here.  */
  
-#define ELF_HWCAP              (boot_cpu_data.x86_capability[0])
+#define ELF_HWCAP              (boot_cpu_data.x86_capability[CPUID_1_EDX])
  
  /* This yields a string that ld.so will use to load implementation
     specific libraries for optimization.  This is more specific in
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h

index 0fd440df63f18d77c040f8683f34018ed426b4b2..c2e46eb96b6d8449c66482005dcad47ccf88e7f4 100644 (file)
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -17,6 +17,7 @@
  #include <asm/user.h>
  #include <asm/fpu/api.h>
  #include <asm/fpu/xstate.h>
+#include <asm/cpufeature.h>
  
  /*
   * High level FPU state handling functions:
@@ -58,22 +59,22 @@ extern u64 fpu__get_supported_xfeatures_mask(void);
   */
  static __always_inline __pure bool use_eager_fpu(void)
  {
-       return static_cpu_has_safe(X86_FEATURE_EAGER_FPU);
+       return static_cpu_has(X86_FEATURE_EAGER_FPU);
  }
  
  static __always_inline __pure bool use_xsaveopt(void)
  {
-       return static_cpu_has_safe(X86_FEATURE_XSAVEOPT);
+       return static_cpu_has(X86_FEATURE_XSAVEOPT);
  }
  
  static __always_inline __pure bool use_xsave(void)
  {
-       return static_cpu_has_safe(X86_FEATURE_XSAVE);
+       return static_cpu_has(X86_FEATURE_XSAVE);
  }
  
  static __always_inline __pure bool use_fxsr(void)
  {
-       return static_cpu_has_safe(X86_FEATURE_FXSR);
+       return static_cpu_has(X86_FEATURE_FXSR);
  }
  
  /*
@@ -300,7 +301,7 @@ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
  
         WARN_ON(system_state != SYSTEM_BOOTING);
  
-       if (static_cpu_has_safe(X86_FEATURE_XSAVES))
+       if (static_cpu_has(X86_FEATURE_XSAVES))
                 XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
         else
                 XSTATE_OP(XSAVE, xstate, lmask, hmask, err);
@@ -322,7 +323,7 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
  
         WARN_ON(system_state != SYSTEM_BOOTING);
  
-       if (static_cpu_has_safe(X86_FEATURE_XSAVES))
+       if (static_cpu_has(X86_FEATURE_XSAVES))
                 XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
         else
                 XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
@@ -460,7 +461,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
          * pending. Clear the x87 state here by setting it to fixed values.
          * "m" is a random variable that should be in L1.
          */
-       if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) {
+       if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
                 asm volatile(
                         "fnclex\n\t"
                         "emms\n\t"
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h

index af30fdeb140da75691e91c52446f7f1bb7fe2e93..f23cd8c80b1c818057053e831ec704ab4006fba4 100644 (file)
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -20,16 +20,15 @@
  
  /* Supported features which support lazy state saving */
  #define XFEATURE_MASK_LAZY     (XFEATURE_MASK_FP | \
-                                XFEATURE_MASK_SSE)
-
-/* Supported features which require eager state saving */
-#define XFEATURE_MASK_EAGER    (XFEATURE_MASK_BNDREGS | \
-                                XFEATURE_MASK_BNDCSR | \
+                                XFEATURE_MASK_SSE | \
                                  XFEATURE_MASK_YMM | \
                                  XFEATURE_MASK_OPMASK | \
                                  XFEATURE_MASK_ZMM_Hi256 | \
                                  XFEATURE_MASK_Hi16_ZMM)
  
+/* Supported features which require eager state saving */
+#define XFEATURE_MASK_EAGER    (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
+
  /* All currently supported features */
  #define XCNTXT_MASK    (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)
  
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h

index 793179cf8e21aa89636f869fc3a9e2fe0b4a29e0..6e4d170726b758a75a46777f416d07bc7ada01e7 100644 (file)
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -1,23 +1,44 @@
-#ifdef __ASSEMBLY__
+#ifndef _ASM_X86_FRAME_H
+#define _ASM_X86_FRAME_H
  
  #include <asm/asm.h>
  
-/* The annotation hides the frame from the unwinder and makes it look
-   like a ordinary ebp save/restore. This avoids some special cases for
-   frame pointer later */
+/*
+ * These are stack frame creation macros.  They should be used by every
+ * callable non-leaf asm function to make kernel stack traces more reliable.
+ */
+
  #ifdef CONFIG_FRAME_POINTER
-       .macro FRAME
-       __ASM_SIZE(push,)       %__ASM_REG(bp)
-       __ASM_SIZE(mov)         %__ASM_REG(sp), %__ASM_REG(bp)
-       .endm
-       .macro ENDFRAME
-       __ASM_SIZE(pop,)        %__ASM_REG(bp)
-       .endm
-#else
-       .macro FRAME
-       .endm
-       .macro ENDFRAME
-       .endm
-#endif
-
-#endif  /*  __ASSEMBLY__  */
+
+#ifdef __ASSEMBLY__
+
+.macro FRAME_BEGIN
+       push %_ASM_BP
+       _ASM_MOV %_ASM_SP, %_ASM_BP
+.endm
+
+.macro FRAME_END
+       pop %_ASM_BP
+.endm
+
+#else /* !__ASSEMBLY__ */
+
+#define FRAME_BEGIN                            \
+       "push %" _ASM_BP "\n"                   \
+       _ASM_MOV "%" _ASM_SP ", %" _ASM_BP "\n"
+
+#define FRAME_END "pop %" _ASM_BP "\n"
+
+#endif /* __ASSEMBLY__ */
+
+#define FRAME_OFFSET __ASM_SEL(4, 8)
+
+#else /* !CONFIG_FRAME_POINTER */
+
+#define FRAME_BEGIN
+#define FRAME_END
+#define FRAME_OFFSET 0
+
+#endif /* CONFIG_FRAME_POINTER */
+
+#endif /* _ASM_X86_FRAME_H */
diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h

index 78162f8e248bdc77c2625238f821791d837d24d4..d0afb05c84fc1ffd434e85f6267326b9091d2595 100644 (file)
--- a/arch/x86/include/asm/irq_work.h
+++ b/arch/x86/include/asm/irq_work.h
@@ -1,7 +1,7 @@
  #ifndef _ASM_IRQ_WORK_H
  #define _ASM_IRQ_WORK_H
  
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
  
  static inline bool arch_irq_work_has_interrupt(void)
  {
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h

index c1adf33fdd0d6f70f055b9a056bc7787bda7635e..bc62e7cbf1b1f883fc9acb0a145305384a3bf062 100644 (file)
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -17,15 +17,8 @@ static inline bool kvm_check_and_clear_guest_paused(void)
  }
  #endif /* CONFIG_KVM_GUEST */
  
-#ifdef CONFIG_DEBUG_RODATA
  #define KVM_HYPERCALL \
          ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL)
-#else
-/* On AMD processors, vmcall will generate a trap that we will
- * then rewrite to the appropriate instruction.
- */
-#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
-#endif
  
  /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
   * instruction.  The hypervisor may replace it with something else but only the
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h

index 2ea4527e462f133398f178a885a4b920892cd3f5..92b6f651fa4fcc7e58d5764f317f9db8b0a5616b 100644 (file)
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -40,8 +40,20 @@
  #define MCI_STATUS_AR   (1ULL<<55)  /* Action required */
  
  /* AMD-specific bits */
-#define MCI_STATUS_DEFERRED    (1ULL<<44)  /* declare an uncorrected error */
+#define MCI_STATUS_DEFERRED    (1ULL<<44)  /* uncorrected error, deferred exception */
  #define MCI_STATUS_POISON      (1ULL<<43)  /* access poisonous data */
+#define MCI_STATUS_TCC         (1ULL<<55)  /* Task context corrupt */
+
+/*
+ * McaX field if set indicates a given bank supports MCA extensions:
+ *  - Deferred error interrupt type is specifiable by bank.
+ *  - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers,
+ *    But should not be used to determine MSR numbers.
+ *  - TCC bit is present in MCx_STATUS.
+ */
+#define MCI_CONFIG_MCAX                0x1
+#define MCI_IPID_MCATYPE       0xFFFF0000
+#define MCI_IPID_HWID          0xFFF
  
  /*
   * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
@@ -91,6 +103,16 @@
  #define MCE_LOG_LEN 32
  #define MCE_LOG_SIGNATURE      "MACHINECHECK"
  
+/* AMD Scalable MCA */
+#define MSR_AMD64_SMCA_MC0_MISC0       0xc0002003
+#define MSR_AMD64_SMCA_MC0_CONFIG      0xc0002004
+#define MSR_AMD64_SMCA_MC0_IPID                0xc0002005
+#define MSR_AMD64_SMCA_MC0_MISC1       0xc000200a
+#define MSR_AMD64_SMCA_MCx_MISC(x)     (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_CONFIG(x)   (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_IPID(x)     (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
+
  /*
   * This structure contains all data related to the MCE log.  Also
   * carries a signature to make it easier to find from external
@@ -113,6 +135,7 @@ struct mca_config {
         bool ignore_ce;
         bool disabled;
         bool ser;
+       bool recovery;
         bool bios_cmci_threshold;
         u8 banks;
         s8 bootlog;
@@ -287,4 +310,49 @@ struct cper_sec_mem_err;
  extern void apei_mce_report_mem_error(int corrected,
                                       struct cper_sec_mem_err *mem_err);
  
+/*
+ * Enumerate new IP types and HWID values in AMD processors which support
+ * Scalable MCA.
+ */
+#ifdef CONFIG_X86_MCE_AMD
+enum amd_ip_types {
+       SMCA_F17H_CORE = 0,     /* Core errors */
+       SMCA_DF,                /* Data Fabric */
+       SMCA_UMC,               /* Unified Memory Controller */
+       SMCA_PB,                /* Parameter Block */
+       SMCA_PSP,               /* Platform Security Processor */
+       SMCA_SMU,               /* System Management Unit */
+       N_AMD_IP_TYPES
+};
+
+struct amd_hwid {
+       const char *name;
+       unsigned int hwid;
+};
+
+extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES];
+
+enum amd_core_mca_blocks {
+       SMCA_LS = 0,    /* Load Store */
+       SMCA_IF,        /* Instruction Fetch */
+       SMCA_L2_CACHE,  /* L2 cache */
+       SMCA_DE,        /* Decoder unit */
+       RES,            /* Reserved */
+       SMCA_EX,        /* Execution unit */
+       SMCA_FP,        /* Floating Point */
+       SMCA_L3_CACHE,  /* L3 cache */
+       N_CORE_MCA_BLOCKS
+};
+
+extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS];
+
+enum amd_df_mca_blocks {
+       SMCA_CS = 0,    /* Coherent Slave */
+       SMCA_PIE,       /* Power management, Interrupts, etc */
+       N_DF_BLOCKS
+};
+
+extern const char * const amd_df_mcablock_names[N_DF_BLOCKS];
+#endif
+
  #endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h

index 55234d5e7160db83bd9854a40c87b8e70593b3c7..1ea0baef1175c407e9097e125a84b226be28266c 100644 (file)
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -19,7 +19,8 @@ typedef struct {
  #endif
  
         struct mutex lock;
-       void __user *vdso;
+       void __user *vdso;                      /* vdso base address */
+       const struct vdso_image *vdso_image;    /* vdso image in use */
  
         atomic_t perf_rdpmc_allowed;    /* nonzero if rdpmc is allowed */
  } mm_context_t;
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h

index b05402ef3b842203f97def484806bec3c2320b9c..984ab75bf621889fb1d3cd1128b4bf1bc39f43ec 100644 (file)
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -1,7 +1,12 @@
  #ifndef _ASM_X86_MSR_INDEX_H
  #define _ASM_X86_MSR_INDEX_H
  
-/* CPU model specific register (MSR) numbers */
+/*
+ * CPU model specific register (MSR) numbers.
+ *
+ * Do not add new entries to this file unless the definitions are shared
+ * between multiple compilation units.
+ */
  
  /* x86-64 specific MSRs */
  #define MSR_EFER               0xc0000080 /* extended feature register */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h

index c70689b5e5aa4c07f06aabd4edd3d344df207c3e..0deeb2d26df7cd7510e831d04d4f80ed297acaf0 100644 (file)
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -3,6 +3,8 @@
  
  #include <linux/sched.h>
  
+#include <asm/cpufeature.h>
+
  #define MWAIT_SUBSTATE_MASK            0xf
  #define MWAIT_CSTATE_MASK              0xf
  #define MWAIT_SUBSTATE_SIZE            4
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h

index 7bcb861a04e5cf5b32c2c26919756fa29353cb07..5a2ed3ed2f261893d5de08022ea3259198c8c0fe 100644 (file)
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -165,6 +165,7 @@ struct x86_pmu_capability {
  #define GLOBAL_STATUS_ASIF                             BIT_ULL(60)
  #define GLOBAL_STATUS_COUNTERS_FROZEN                  BIT_ULL(59)
  #define GLOBAL_STATUS_LBRS_FROZEN                      BIT_ULL(58)
+#define GLOBAL_STATUS_TRACE_TOPAPMI                    BIT_ULL(55)
  
  /*
   * IBS cpuid feature detection
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h

index 20c11d1aa4ccce11b0709c3fe56092a5505151cf..983738ac014c6508f499b26650d44b6f5b3b1b2b 100644 (file)
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -13,7 +13,7 @@ struct vm86;
  #include <asm/types.h>
  #include <uapi/asm/sigcontext.h>
  #include <asm/current.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/page.h>
  #include <asm/pgtable_types.h>
  #include <asm/percpu.h>
@@ -24,7 +24,6 @@ struct vm86;
  #include <asm/fpu/types.h>
  
  #include <linux/personality.h>
-#include <linux/cpumask.h>
  #include <linux/cache.h>
  #include <linux/threads.h>
  #include <linux/math64.h>
@@ -129,6 +128,8 @@ struct cpuinfo_x86 {
         u16                     booted_cores;
         /* Physical processor id: */
         u16                     phys_proc_id;
+       /* Logical processor id: */
+       u16                     logical_proc_id;
         /* Core id: */
         u16                     cpu_core_id;
         /* Compute unit id */
@@ -298,10 +299,13 @@ struct tss_struct {
          */
         unsigned long           io_bitmap[IO_BITMAP_LONGS + 1];
  
+#ifdef CONFIG_X86_32
         /*
-        * Space for the temporary SYSENTER stack:
+        * Space for the temporary SYSENTER stack.
          */
+       unsigned long           SYSENTER_stack_canary;
         unsigned long           SYSENTER_stack[64];
+#endif
  
  } ____cacheline_aligned;
  
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h

index a4a77286cb1ddf68b466e934e9f69cb1785f3062..9b9b30b1944187c8c3de3ed5c4ef1d82ea7c422d 100644 (file)
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -7,12 +7,23 @@
  
  void syscall_init(void);
  
+#ifdef CONFIG_X86_64
  void entry_SYSCALL_64(void);
-void entry_SYSCALL_compat(void);
+#endif
+
+#ifdef CONFIG_X86_32
  void entry_INT80_32(void);
-void entry_INT80_compat(void);
  void entry_SYSENTER_32(void);
+void __begin_SYSENTER_singlestep_region(void);
+void __end_SYSENTER_singlestep_region(void);
+#endif
+
+#ifdef CONFIG_IA32_EMULATION
  void entry_SYSENTER_compat(void);
+void __end_entry_SYSENTER_compat(void);
+void entry_SYSCALL_compat(void);
+void entry_INT80_compat(void);
+#endif
  
  void x86_configure_nx(void);
  void x86_report_nx(void);
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h

index 0a5242428659045cfb439d3045593cc1c63aad96..13b6cdd0af57049468e47ef884e6eccba49dca0e 100644 (file)
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -7,7 +7,7 @@
  extern char __brk_base[], __brk_limit[];
  extern struct exception_table_entry __stop___ex_table[];
  
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
  extern char __end_rodata_hpage_align[];
  #endif
  
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h

index 89db46752a8f0e1c78c9403ef4b40a805b488999..452c88b8ad0691cc4d22ae4289d9f0cfa6b5c332 100644 (file)
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -13,7 +13,6 @@
                          X86_EFLAGS_CF | X86_EFLAGS_RF)
  
  void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
-int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc);
  int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
                      struct pt_regs *regs, unsigned long mask);
  
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h

index ba665ebd17bb8f22a39712dbf2a8c398b9e9207f..db333300bd4be17205daf3b2e820c0af101ccc2d 100644 (file)
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -15,7 +15,7 @@
  
  #include <linux/stringify.h>
  #include <asm/nops.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  
  /* "Raw" instruction opcodes */
  #define __ASM_CLAC     .byte 0x0f,0x01,0xca
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h

index dfcf0727623b3f89d53374ea504f243e802be149..20a3de5cb3b0dd5e3362833baebd752c1142ae4e 100644 (file)
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -16,7 +16,6 @@
  #endif
  #include <asm/thread_info.h>
  #include <asm/cpumask.h>
-#include <asm/cpufeature.h>
  
  extern int smp_num_siblings;
  extern unsigned int num_processors;
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h

index ff8b9a17dc4b2d354971fb52b60b69e6fbb0e903..ca6ba36077054605580e8b8a6236eab7aa65733f 100644 (file)
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -78,6 +78,19 @@ int strcmp(const char *cs, const char *ct);
  #define memset(s, c, n) __memset(s, c, n)
  #endif
  
+/**
+ * memcpy_mcsafe - copy memory with indication if a machine check happened
+ *
+ * @dst:       destination address
+ * @src:       source address
+ * @cnt:       number of bytes to copy
+ *
+ * Low level memory copy function that catches machine checks
+ *
+ * Return true for success, false for fail
+ */
+bool memcpy_mcsafe(void *dst, const void *src, size_t cnt);
+
  #endif /* __KERNEL__ */
  
  #endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h

index c7b551028740f18a5360070a6ccb5dc9ad714c5e..82866697fcf186ac7f63f1f6b5f1f77385dc8ab4 100644 (file)
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -49,7 +49,7 @@
   */
  #ifndef __ASSEMBLY__
  struct task_struct;
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
  #include <linux/atomic.h>
  
  struct thread_info {
@@ -134,10 +134,13 @@ struct thread_info {
  #define _TIF_ADDR32            (1 << TIF_ADDR32)
  #define _TIF_X32               (1 << TIF_X32)
  
-/* work to do in syscall_trace_enter() */
+/*
+ * work to do in syscall_trace_enter().  Also includes TIF_NOHZ for
+ * enter_from_user_mode()
+ */
  #define _TIF_WORK_SYSCALL_ENTRY        \
         (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |   \
-        _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT |     \
+        _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT |       \
          _TIF_NOHZ)
  
  /* work to do on any return to user space */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h

index 6df2029405a3ae55df8b9718dd320b55dde5c1ad..0bb31cb8c73b4a3183f3b9c138ff53ddbe9838b8 100644 (file)
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -5,6 +5,7 @@
  #include <linux/sched.h>
  
  #include <asm/processor.h>
+#include <asm/cpufeature.h>
  #include <asm/special_insns.h>
  
  #ifdef CONFIG_PARAVIRT
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h

index 0fb46482dfde160b9dcfad6ef57841e07c3830e2..7f991bd5031b24947e0773265023ab70b934a7b8 100644 (file)
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -119,12 +119,23 @@ static inline void setup_node_to_cpumask_map(void) { }
  
  extern const struct cpumask *cpu_coregroup_mask(int cpu);
  
+#define topology_logical_package_id(cpu)       (cpu_data(cpu).logical_proc_id)
  #define topology_physical_package_id(cpu)      (cpu_data(cpu).phys_proc_id)
  #define topology_core_id(cpu)                  (cpu_data(cpu).cpu_core_id)
  
  #ifdef ENABLE_TOPO_DEFINES
  #define topology_core_cpumask(cpu)             (per_cpu(cpu_core_map, cpu))
  #define topology_sibling_cpumask(cpu)          (per_cpu(cpu_sibling_map, cpu))
+
+extern unsigned int __max_logical_packages;
+#define topology_max_packages()                        (__max_logical_packages)
+int topology_update_package_map(unsigned int apicid, unsigned int cpu);
+extern int topology_phys_to_logical_pkg(unsigned int pkg);
+#else
+#define topology_max_packages()                        (1)
+static inline int
+topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; }
+static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
  #endif
  
  static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h

index a4a30e4b2d34321d96ddd02413d037d3f6cb95d9..c0f27d7ea7ff95eaea44987bd94417fabe68ca98 100644 (file)
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -90,12 +90,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
         likely(!__range_not_ok(addr, size, user_addr_max()))
  
  /*
- * The exception table consists of pairs of addresses relative to the
- * exception table enty itself: the first is the address of an
- * instruction that is allowed to fault, and the second is the address
- * at which the program should continue.  No registers are modified,
- * so it is entirely up to the continuation code to figure out what to
- * do.
+ * The exception table consists of triples of addresses relative to the
+ * exception table entry itself. The first address is of an instruction
+ * that is allowed to fault, the second is the target at which the program
+ * should continue. The third is a handler function to deal with the fault
+ * caused by the instruction in the first field.
   *
   * All the routines below use bits of fixup code that are out of line
   * with the main instruction path.  This means when everything is well,
@@ -104,13 +103,14 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
   */
  
  struct exception_table_entry {
-       int insn, fixup;
+       int insn, fixup, handler;
  };
  /* This is not the generic standard exception_table_entry format */
  #define ARCH_HAS_SORT_EXTABLE
  #define ARCH_HAS_SEARCH_EXTABLE
  
-extern int fixup_exception(struct pt_regs *regs);
+extern int fixup_exception(struct pt_regs *regs, int trapnr);
+extern bool ex_has_fault_handler(unsigned long ip);
  extern int early_fixup_exception(unsigned long *ip);
  
  /*
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h

index b89c34c4019b5818da47434dcd711235a65baa40..307698688fa1cfde037e9d9f96426269b7cd61e9 100644 (file)
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -8,7 +8,7 @@
  #include <linux/errno.h>
  #include <linux/lockdep.h>
  #include <asm/alternative.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/page.h>
  
  /*
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h

index deabaf9759b640d5cd93f50f9db67ef2dc60a807..43dc55be524e7fdc8e5150f87b02a8639b20b3f3 100644 (file)
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -13,9 +13,6 @@ struct vdso_image {
         void *data;
         unsigned long size;   /* Always a multiple of PAGE_SIZE */
  
-       /* text_mapping.pages is big enough for data/size page pointers */
-       struct vm_special_mapping text_mapping;
-
         unsigned long alt, alt_len;
  
         long sym_vvar_start;  /* Negative offset to the vvar area */
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h

index f556c4843aa18af74359dfeb2a41d39d9a2c3bb9..e728699db7741f0282441a79635a88afd23259c8 100644 (file)
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -37,6 +37,12 @@ struct vsyscall_gtod_data {
  };
  extern struct vsyscall_gtod_data vsyscall_gtod_data;
  
+extern int vclocks_used;
+static inline bool vclock_was_used(int vclock)
+{
+       return READ_ONCE(vclocks_used) & (1 << vclock);
+}
+
  static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s)
  {
         unsigned ret;
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h

index d485232f1e9f9a1b0f0ba4d44bf8382c39175757..62d4111c1c549a8f1cc70636e708934a83605cde 100644 (file)
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -256,7 +256,7 @@ struct sigcontext_64 {
         __u16                           cs;
         __u16                           gs;
         __u16                           fs;
-       __u16                           __pad0;
+       __u16                           ss;
         __u64                           err;
         __u64                           trapno;
         __u64                           oldmask;
@@ -341,9 +341,37 @@ struct sigcontext {
         __u64                           rip;
         __u64                           eflags;         /* RFLAGS */
         __u16                           cs;
+
+       /*
+        * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"),
+        * Linux saved and restored fs and gs in these slots.  This
+        * was counterproductive, as fsbase and gsbase were never
+        * saved, so arch_prctl was presumably unreliable.
+        *
+        * These slots should never be reused without extreme caution:
+        *
+        *  - Some DOSEMU versions stash fs and gs in these slots manually,
+        *    thus overwriting anything the kernel expects to be preserved
+        *    in these slots.
+        *
+        *  - If these slots are ever needed for any other purpose,
+        *    there is some risk that very old 64-bit binaries could get
+        *    confused.  I doubt that many such binaries still work,
+        *    though, since the same patch in 2.5.64 also removed the
+        *    64-bit set_thread_area syscall, so it appears that there
+        *    is no TLS API beyond modify_ldt that works in both pre-
+        *    and post-2.5.64 kernels.
+        *
+        * If the kernel ever adds explicit fs, gs, fsbase, and gsbase
+        * save/restore, it will most likely need to be opt-in and use
+        * different context slots.
+        */
         __u16                           gs;
         __u16                           fs;
-       __u16                           __pad0;
+       union {
+               __u16                   ss;     /* If UC_SIGCONTEXT_SS */
+               __u16                   __pad0; /* Alias name for old (!UC_SIGCONTEXT_SS) user-space */
+       };
         __u64                           err;
         __u64                           trapno;
         __u64                           oldmask;
diff --git a/arch/x86/include/uapi/asm/ucontext.h b/arch/x86/include/uapi/asm/ucontext.h

index b7c29c8017f280ba4cdf58e08557dd94221991fb..e3d1ec90616ed39fcdc259fae324530c5d0477d5 100644 (file)
--- a/arch/x86/include/uapi/asm/ucontext.h
+++ b/arch/x86/include/uapi/asm/ucontext.h
@@ -1,11 +1,54 @@
  #ifndef _ASM_X86_UCONTEXT_H
  #define _ASM_X86_UCONTEXT_H
  
-#define UC_FP_XSTATE   0x1     /* indicates the presence of extended state
-                                * information in the memory layout pointed
-                                * by the fpstate pointer in the ucontext's
-                                * sigcontext struct (uc_mcontext).
-                                */
+/*
+ * Indicates the presence of extended state information in the memory
+ * layout pointed by the fpstate pointer in the ucontext's sigcontext
+ * struct (uc_mcontext).
+ */
+#define UC_FP_XSTATE   0x1
+
+#ifdef __x86_64__
+/*
+ * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on
+ * kernels that save SS in the sigcontext.  All kernels that set
+ * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp
+ * regardless of SS (i.e. they implement espfix).
+ *
+ * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS
+ * when delivering a signal that came from 64-bit code.
+ *
+ * Sigreturn restores SS as follows:
+ *
+ * if (saved SS is valid || UC_STRICT_RESTORE_SS is set ||
+ *     saved CS is not 64-bit)
+ *         new SS = saved SS  (will fail IRET and signal if invalid)
+ * else
+ *         new SS = a flat 32-bit data segment
+ *
+ * This behavior serves three purposes:
+ *
+ * - Legacy programs that construct a 64-bit sigcontext from scratch
+ *   with zero or garbage in the SS slot (e.g. old CRIU) and call
+ *   sigreturn will still work.
+ *
+ * - Old DOSEMU versions sometimes catch a signal from a segmented
+ *   context, delete the old SS segment (with modify_ldt), and change
+ *   the saved CS to a 64-bit segment.  These DOSEMU versions expect
+ *   sigreturn to send them back to 64-bit mode without killing them,
+ *   despite the fact that the SS selector when the signal was raised is
+ *   no longer valid.  UC_STRICT_RESTORE_SS will be clear, so the kernel
+ *   will fix up SS for these DOSEMU versions.
+ *
+ * - Old and new programs that catch a signal and return without
+ *   modifying the saved context will end up in exactly the state they
+ *   started in, even if they were running in a segmented context when
+ *   the signal was raised..  Old kernels would lose track of the
+ *   previous SS value.
+ */
+#define UC_SIGCONTEXT_SS       0x2
+#define UC_STRICT_RESTORE_SS   0x4
+#endif
  
  #include <asm-generic/ucontext.h>
  
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c

index 8a5cddac7d444084c223e95948944b5099d15003..531b9611c51d5d4b42f721f45609a5f4ecd3cce2 100644 (file)
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2077,6 +2077,20 @@ int generic_processor_info(int apicid, int version)
         } else
                 cpu = cpumask_next_zero(-1, cpu_present_mask);
  
+       /*
+        * This can happen on physical hotplug. The sanity check at boot time
+        * is done from native_smp_prepare_cpus() after num_possible_cpus() is
+        * established.
+        */
+       if (topology_update_package_map(apicid, cpu) < 0) {
+               int thiscpu = max + disabled_cpus;
+
+               pr_warning("ACPI: Package limit reached. Processor %d/0x%x ignored.\n",
+                          thiscpu, apicid);
+               disabled_cpus++;
+               return -ENOSPC;
+       }
+
         /*
          * Validate version
          */
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c

index c80c02c6ec4944a85715f4438443e5987df4f612..ab5c2c685a3ce908c3677262a35fd402ba77a445 100644 (file)
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -30,7 +30,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x)
         unsigned long value;
         unsigned int id = (x >> 24) & 0xff;
  
-       if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) {
+       if (static_cpu_has(X86_FEATURE_NODEID_MSR)) {
                 rdmsrl(MSR_FAM10H_NODE_ID, value);
                 id |= (value << 2) & 0xff00;
         }
@@ -178,7 +178,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
         this_cpu_write(cpu_llc_id, node);
  
         /* Account for nodes per socket in multi-core-module processors */
-       if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) {
+       if (static_cpu_has(X86_FEATURE_NODEID_MSR)) {
                 rdmsrl(MSR_FAM10H_NODE_ID, val);
                 nodes = ((val >> 3) & 7) + 1;
         }
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c

index 84a7524b202cad58509a68db5e6d229f7765b6aa..5c042466f274d0f79dac84bba1c6835953565c84 100644 (file)
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -59,7 +59,6 @@ void common(void) {
  
  #ifdef CONFIG_PARAVIRT
         BLANK();
-       OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
         OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
         OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
         OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c

index 6ce39025f467fb060ce68c8b8ebfaf8f87258d09..ecdc1d217dc0f50a7b760c25cec8bee586643336 100644 (file)
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -7,7 +7,7 @@
  #include <linux/lguest.h>
  #include "../../../drivers/lguest/lg.h"
  
-#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
  static char syscalls[] = {
  #include <asm/syscalls_32.h>
  };
@@ -52,6 +52,11 @@ void foo(void)
         DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
                offsetofend(struct tss_struct, SYSENTER_stack));
  
+       /* Offset from cpu_tss to SYSENTER_stack */
+       OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
+       /* Size of SYSENTER_stack */
+       DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+
  #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
         BLANK();
         OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c

index f2edafb5f24eb2034ee54751292af3fa9bd70fb9..d875f97d4e0ba0477c648e1244ff238bfd48be03 100644 (file)
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -4,17 +4,11 @@
  
  #include <asm/ia32.h>
  
-#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
-#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1,
-#ifdef CONFIG_X86_X32_ABI
-# define __SYSCALL_X32(nr, sym, compat) [nr] = 1,
-#else
-# define __SYSCALL_X32(nr, sym, compat) /* nothing */
-#endif
+#define __SYSCALL_64(nr, sym, qual) [nr] = 1,
  static char syscalls_64[] = {
  #include <asm/syscalls_64.h>
  };
-#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
  static char syscalls_ia32[] = {
  #include <asm/syscalls_32.h>
  };
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile

index 58031303e30488c540d609f95d0693918c09fa64..0d373d7affc8d48d1039f6c6d759a3b9cc191c38 100644 (file)
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -30,33 +30,11 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)               += centaur.o
  obj-$(CONFIG_CPU_SUP_TRANSMETA_32)     += transmeta.o
  obj-$(CONFIG_CPU_SUP_UMC_32)           += umc.o
  
-obj-$(CONFIG_PERF_EVENTS)              += perf_event.o
-
-ifdef CONFIG_PERF_EVENTS
-obj-$(CONFIG_CPU_SUP_AMD)              += perf_event_amd.o perf_event_amd_uncore.o
-ifdef CONFIG_AMD_IOMMU
-obj-$(CONFIG_CPU_SUP_AMD)              += perf_event_amd_iommu.o
-endif
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_p6.o perf_event_knc.o perf_event_p4.o
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_intel_rapl.o perf_event_intel_cqm.o
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_intel_pt.o perf_event_intel_bts.o
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_intel_cstate.o
-
-obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
-                                          perf_event_intel_uncore_snb.o \
-                                          perf_event_intel_uncore_snbep.o \
-                                          perf_event_intel_uncore_nhmex.o
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_msr.o
-obj-$(CONFIG_CPU_SUP_AMD)              += perf_event_msr.o
-endif
-
-
  obj-$(CONFIG_X86_MCE)                  += mcheck/
  obj-$(CONFIG_MTRR)                     += mtrr/
  obj-$(CONFIG_MICROCODE)                        += microcode/
  
-obj-$(CONFIG_X86_LOCAL_APIC)           += perfctr-watchdog.o perf_event_amd_ibs.o
+obj-$(CONFIG_X86_LOCAL_APIC)           += perfctr-watchdog.o
  
  obj-$(CONFIG_HYPERVISOR_GUEST)         += vmware.o hypervisor.o mshyperv.o
  
@@ -64,7 +42,7 @@ ifdef CONFIG_X86_FEATURE_NAMES
  quiet_cmd_mkcapflags = MKCAP   $@
        cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
  
-cpufeature = $(src)/../../include/asm/cpufeature.h
+cpufeature = $(src)/../../include/asm/cpufeatures.h
  
  targets += capflags.c
  $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c

index a07956a08936e8ea56c1a73075d700d281895577..97c59fd60702f4ceacad7c4d46a3c8d5de7dacbf 100644 (file)
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -117,7 +117,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
                 void (*f_vide)(void);
                 u64 d, d2;
  
-               printk(KERN_INFO "AMD K6 stepping B detected - ");
+               pr_info("AMD K6 stepping B detected - ");
  
                 /*
                  * It looks like AMD fixed the 2.6.2 bug and improved indirect
@@ -133,10 +133,9 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
                 d = d2-d;
  
                 if (d > 20*K6_BUG_LOOP)
-                       printk(KERN_CONT
-                               "system stability may be impaired when more than 32 MB are used.\n");
+                       pr_cont("system stability may be impaired when more than 32 MB are used.\n");
                 else
-                       printk(KERN_CONT "probably OK (after B9730xxxx).\n");
+                       pr_cont("probably OK (after B9730xxxx).\n");
         }
  
         /* K6 with old style WHCR */
@@ -154,7 +153,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
                         wbinvd();
                         wrmsr(MSR_K6_WHCR, l, h);
                         local_irq_restore(flags);
-                       printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
+                       pr_info("Enabling old style K6 write allocation for %d Mb\n",
                                 mbytes);
                 }
                 return;
@@ -175,7 +174,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
                         wbinvd();
                         wrmsr(MSR_K6_WHCR, l, h);
                         local_irq_restore(flags);
-                       printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
+                       pr_info("Enabling new style K6 write allocation for %d Mb\n",
                                 mbytes);
                 }
  
@@ -202,7 +201,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
          */
         if (c->x86_model >= 6 && c->x86_model <= 10) {
                 if (!cpu_has(c, X86_FEATURE_XMM)) {
-                       printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
+                       pr_info("Enabling disabled K7/SSE Support.\n");
                         msr_clear_bit(MSR_K7_HWCR, 15);
                         set_cpu_cap(c, X86_FEATURE_XMM);
                 }
@@ -216,9 +215,8 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
         if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
                 rdmsr(MSR_K7_CLK_CTL, l, h);
                 if ((l & 0xfff00000) != 0x20000000) {
-                       printk(KERN_INFO
-                           "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
-                                       l, ((l & 0x000fffff)|0x20000000));
+                       pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
+                               l, ((l & 0x000fffff)|0x20000000));
                         wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
                 }
         }
@@ -485,7 +483,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
                 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
                         unsigned long pfn = tseg >> PAGE_SHIFT;
  
-                       printk(KERN_DEBUG "tseg: %010llx\n", tseg);
+                       pr_debug("tseg: %010llx\n", tseg);
                         if (pfn_range_is_mapped(pfn, pfn + 1))
                                 set_memory_4k((unsigned long)__va(tseg), 1);
                 }
@@ -500,8 +498,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
  
                         rdmsrl(MSR_K7_HWCR, val);
                         if (!(val & BIT(24)))
-                               printk(KERN_WARNING FW_BUG "TSC doesn't count "
-                                       "with P0 frequency!\n");
+                               pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
                 }
         }
  
diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c

index 04f0fe5af83ec34bb4fd6ec09fd3cc8db1c7ee07..a972ac4c7e7df05238e78b08e093cded47640fcd 100644 (file)
--- a/arch/x86/kernel/cpu/bugs_64.c
+++ b/arch/x86/kernel/cpu/bugs_64.c
@@ -15,7 +15,7 @@ void __init check_bugs(void)
  {
         identify_boot_cpu();
  #if !defined(CONFIG_SMP)
-       printk(KERN_INFO "CPU: ");
+       pr_info("CPU: ");
         print_cpu_info(&boot_cpu_data);
  #endif
         alternative_instructions();
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c

index ae20be6e483c77703413cf36e9db065134da01f3..1661d8ec92805191cd4581c365db68ea5acfdf02 100644 (file)
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,7 +1,7 @@
  #include <linux/bitops.h>
  #include <linux/kernel.h>
  
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
  #include <asm/e820.h>
  #include <asm/mtrr.h>
  #include <asm/msr.h>
@@ -29,7 +29,7 @@ static void init_c3(struct cpuinfo_x86 *c)
                         rdmsr(MSR_VIA_FCR, lo, hi);
                         lo |= ACE_FCR;          /* enable ACE unit */
                         wrmsr(MSR_VIA_FCR, lo, hi);
-                       printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n");
+                       pr_info("CPU: Enabled ACE h/w crypto\n");
                 }
  
                 /* enable RNG unit, if present and disabled */
@@ -37,7 +37,7 @@ static void init_c3(struct cpuinfo_x86 *c)
                         rdmsr(MSR_VIA_RNG, lo, hi);
                         lo |= RNG_ENABLE;       /* enable RNG unit */
                         wrmsr(MSR_VIA_RNG, lo, hi);
-                       printk(KERN_INFO "CPU: Enabled h/w RNG\n");
+                       pr_info("CPU: Enabled h/w RNG\n");
                 }
  
                 /* store Centaur Extended Feature Flags as
@@ -130,7 +130,7 @@ static void init_centaur(struct cpuinfo_x86 *c)
                         name = "C6";
                         fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK;
                         fcr_clr = DPDC;
-                       printk(KERN_NOTICE "Disabling bugged TSC.\n");
+                       pr_notice("Disabling bugged TSC.\n");
                         clear_cpu_cap(c, X86_FEATURE_TSC);
                         break;
                 case 8:
@@ -163,11 +163,11 @@ static void init_centaur(struct cpuinfo_x86 *c)
                 newlo = (lo|fcr_set) & (~fcr_clr);
  
                 if (newlo != lo) {
-                       printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n",
+                       pr_info("Centaur FCR was 0x%X now 0x%X\n",
                                 lo, newlo);
                         wrmsr(MSR_IDT_FCR1, newlo, hi);
                 } else {
-                       printk(KERN_INFO "Centaur FCR is 0x%X\n", lo);
+                       pr_info("Centaur FCR is 0x%X\n", lo);
                 }
                 /* Emulate MTRRs using Centaur's MCR. */
                 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c

index 37830de8f60a8f0d8f8da27d409da72ce1f13551..4e8d25d395eeca383248d0ed76cc0fe28d0134d2 100644 (file)
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -228,7 +228,7 @@ static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
         lo |= 0x200000;
         wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
  
-       printk(KERN_NOTICE "CPU serial number disabled.\n");
+       pr_notice("CPU serial number disabled.\n");
         clear_cpu_cap(c, X86_FEATURE_PN);
  
         /* Disabling the serial number may affect the cpuid level */
@@ -329,9 +329,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
                 if (!warn)
                         continue;
  
-               printk(KERN_WARNING
-                      "CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n",
-                               x86_cap_flag(df->feature), df->level);
+               pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n",
+                       x86_cap_flag(df->feature), df->level);
         }
  }
  
@@ -510,7 +509,7 @@ void detect_ht(struct cpuinfo_x86 *c)
         smp_num_siblings = (ebx & 0xff0000) >> 16;
  
         if (smp_num_siblings == 1) {
-               printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n");
+               pr_info_once("CPU0: Hyper-Threading is disabled\n");
                 goto out;
         }
  
@@ -531,10 +530,10 @@ void detect_ht(struct cpuinfo_x86 *c)
  
  out:
         if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
-               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
-                      c->phys_proc_id);
-               printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
-                      c->cpu_core_id);
+               pr_info("CPU: Physical Processor ID: %d\n",
+                       c->phys_proc_id);
+               pr_info("CPU: Processor Core ID: %d\n",
+                       c->cpu_core_id);
                 printed = 1;
         }
  #endif
@@ -559,9 +558,8 @@ static void get_cpu_vendor(struct cpuinfo_x86 *c)
                 }
         }
  
-       printk_once(KERN_ERR
-                       "CPU: vendor_id '%s' unknown, using generic init.\n" \
-                       "CPU: Your system may be unstable.\n", v);
+       pr_err_once("CPU: vendor_id '%s' unknown, using generic init.\n" \
+                   "CPU: Your system may be unstable.\n", v);
  
         c->x86_vendor = X86_VENDOR_UNKNOWN;
         this_cpu = &default_cpu;
@@ -760,7 +758,7 @@ void __init early_cpu_init(void)
         int count = 0;
  
  #ifdef CONFIG_PROCESSOR_SELECT
-       printk(KERN_INFO "KERNEL supported cpus:\n");
+       pr_info("KERNEL supported cpus:\n");
  #endif
  
         for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
@@ -778,7 +776,7 @@ void __init early_cpu_init(void)
                         for (j = 0; j < 2; j++) {
                                 if (!cpudev->c_ident[j])
                                         continue;
-                               printk(KERN_INFO "  %s %s\n", cpudev->c_vendor,
+                               pr_info("  %s %s\n", cpudev->c_vendor,
                                         cpudev->c_ident[j]);
                         }
                 }
@@ -802,6 +800,31 @@ static void detect_nopl(struct cpuinfo_x86 *c)
         clear_cpu_cap(c, X86_FEATURE_NOPL);
  #else
         set_cpu_cap(c, X86_FEATURE_NOPL);
+#endif
+
+       /*
+        * ESPFIX is a strange bug.  All real CPUs have it.  Paravirt
+        * systems that run Linux at CPL > 0 may or may not have the
+        * issue, but, even if they have the issue, there's absolutely
+        * nothing we can do about it because we can't use the real IRET
+        * instruction.
+        *
+        * NB: For the time being, only 32-bit kernels support
+        * X86_BUG_ESPFIX as such.  64-bit kernels directly choose
+        * whether to apply espfix using paravirt hooks.  If any
+        * non-paravirt system ever shows up that does *not* have the
+        * ESPFIX issue, we can change this.
+        */
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_PARAVIRT
+       do {
+               extern void native_iret(void);
+               if (pv_cpu_ops.iret == native_iret)
+                       set_cpu_bug(c, X86_BUG_ESPFIX);
+       } while (0);
+#else
+       set_cpu_bug(c, X86_BUG_ESPFIX);
+#endif
  #endif
  }
  
@@ -977,6 +1000,8 @@ static void identify_cpu(struct cpuinfo_x86 *c)
  #ifdef CONFIG_NUMA
         numa_add_cpu(smp_processor_id());
  #endif
+       /* The boot/hotplug time assigment got cleared, restore it */
+       c->logical_proc_id = topology_phys_to_logical_pkg(c->phys_proc_id);
  }
  
  /*
@@ -1061,7 +1086,7 @@ static void __print_cpu_msr(void)
                 for (index = index_min; index < index_max; index++) {
                         if (rdmsrl_safe(index, &val))
                                 continue;
-                       printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
+                       pr_info(" MSR%08x: %016llx\n", index, val);
                 }
         }
  }
@@ -1100,19 +1125,19 @@ void print_cpu_info(struct cpuinfo_x86 *c)
         }
  
         if (vendor && !strstr(c->x86_model_id, vendor))
-               printk(KERN_CONT "%s ", vendor);
+               pr_cont("%s ", vendor);
  
         if (c->x86_model_id[0])
-               printk(KERN_CONT "%s", c->x86_model_id);
+               pr_cont("%s", c->x86_model_id);
         else
-               printk(KERN_CONT "%d86", c->x86);
+               pr_cont("%d86", c->x86);
  
-       printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
+       pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
  
         if (c->x86_mask || c->cpuid_level >= 0)
-               printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask);
+               pr_cont(", stepping: 0x%x)\n", c->x86_mask);
         else
-               printk(KERN_CONT ")\n");
+               pr_cont(")\n");
  
         print_cpu_msr(c);
  }
@@ -1438,7 +1463,7 @@ void cpu_init(void)
  
         show_ucode_info_early();
  
-       printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+       pr_info("Initializing CPU#%d\n", cpu);
  
         if (cpu_feature_enabled(X86_FEATURE_VME) ||
             cpu_has_tsc ||
@@ -1475,20 +1500,6 @@ void cpu_init(void)
  }
  #endif
  
-#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS
-void warn_pre_alternatives(void)
-{
-       WARN(1, "You're using static_cpu_has before alternatives have run!\n");
-}
-EXPORT_SYMBOL_GPL(warn_pre_alternatives);
-#endif
-
-inline bool __static_cpu_has_safe(u16 bit)
-{
-       return boot_cpu_has(bit);
-}
-EXPORT_SYMBOL_GPL(__static_cpu_has_safe);
-
  static void bsp_resume(void)
  {
         if (this_cpu->c_bsp_resume)
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c

index aaf152e79637384781d264afbff76a58da9489cc..6adef9cac23ee99c96924e2789abeb2d1ad123f3 100644 (file)
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -8,6 +8,7 @@
  #include <linux/timer.h>
  #include <asm/pci-direct.h>
  #include <asm/tsc.h>
+#include <asm/cpufeature.h>
  
  #include "cpu.h"
  
@@ -103,7 +104,7 @@ static void check_cx686_slop(struct cpuinfo_x86 *c)
                 local_irq_restore(flags);
  
                 if (ccr5 & 2) { /* possible wrong calibration done */
-                       printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n");
+                       pr_info("Recalibrating delay loop with SLOP bit reset\n");
                         calibrate_delay();
                         c->loops_per_jiffy = loops_per_jiffy;
                 }
@@ -115,7 +116,7 @@ static void set_cx86_reorder(void)
  {
         u8 ccr3;
  
-       printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n");
+       pr_info("Enable Memory access reorder on Cyrix/NSC processor.\n");
         ccr3 = getCx86(CX86_CCR3);
         setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
  
@@ -128,7 +129,7 @@ static void set_cx86_reorder(void)
  
  static void set_cx86_memwb(void)
  {
-       printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
+       pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
  
         /* CCR2 bit 2: unlock NW bit */
         setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
@@ -268,7 +269,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
                  *  VSA1 we work around however.
                  */
  
-               printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
+               pr_info("Working around Cyrix MediaGX virtual DMA bugs.\n");
                 isa_dma_bridge_buggy = 2;
  
                 /* We do this before the PCI layer is running. However we
@@ -426,7 +427,7 @@ static void cyrix_identify(struct cpuinfo_x86 *c)
                 if (dir0 == 5 || dir0 == 3) {
                         unsigned char ccr3;
                         unsigned long flags;
-                       printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
+                       pr_info("Enabling CPUID on Cyrix processor.\n");
                         local_irq_save(flags);
                         ccr3 = getCx86(CX86_CCR3);
                         /* enable MAPEN  */
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c

index d820d8eae96be0b3daa0ec01d9f4a22bf1ad930e..73d391ae452f82a7bfeca4be2276eed628dcd452 100644 (file)
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -56,7 +56,7 @@ detect_hypervisor_vendor(void)
         }
  
         if (max_pri)
-               printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name);
+               pr_info("Hypervisor detected: %s\n", x86_hyper->name);
  }
  
  void init_hypervisor(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c

index 565648bc1a0aef6c3cf60da92ec9fb60a2408c90..1f7fdb91a818bc10d4b975a070d6b2c1a947b15b 100644 (file)
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -8,7 +8,7 @@
  #include <linux/module.h>
  #include <linux/uaccess.h>
  
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
  #include <asm/pgtable.h>
  #include <asm/msr.h>
  #include <asm/bugs.h>
@@ -61,7 +61,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
          */
         if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 &&
             c->microcode < 0x20e) {
-               printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n");
+               pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
                 clear_cpu_cap(c, X86_FEATURE_PSE);
         }
  
@@ -140,7 +140,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
         if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
                 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
                 if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
-                       printk(KERN_INFO "Disabled fast string operations\n");
+                       pr_info("Disabled fast string operations\n");
                         setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
                         setup_clear_cpu_cap(X86_FEATURE_ERMS);
                 }
@@ -160,6 +160,19 @@ static void early_init_intel(struct cpuinfo_x86 *c)
                 pr_info("Disabling PGE capability bit\n");
                 setup_clear_cpu_cap(X86_FEATURE_PGE);
         }
+
+       if (c->cpuid_level >= 0x00000001) {
+               u32 eax, ebx, ecx, edx;
+
+               cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
+               /*
+                * If HTT (EDX[28]) is set EBX[16:23] contain the number of
+                * apicids which are reserved per package. Store the resulting
+                * shift value for the package management code.
+                */
+               if (edx & (1U << 28))
+                       c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
+       }
  }
  
  #ifdef CONFIG_X86_32
@@ -176,7 +189,7 @@ int ppro_with_ram_bug(void)
             boot_cpu_data.x86 == 6 &&
             boot_cpu_data.x86_model == 1 &&
             boot_cpu_data.x86_mask < 8) {
-               printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n");
+               pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n");
                 return 1;
         }
         return 0;
@@ -225,7 +238,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
  
                 set_cpu_bug(c, X86_BUG_F00F);
                 if (!f00f_workaround_enabled) {
-                       printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
+                       pr_notice("Intel Pentium with F0 0F bug - workaround enabled.\n");
                         f00f_workaround_enabled = 1;
                 }
         }
@@ -244,7 +257,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
          * Forcefully enable PAE if kernel parameter "forcepae" is present.
          */
         if (forcepae) {
-               printk(KERN_WARNING "PAE forced!\n");
+               pr_warn("PAE forced!\n");
                 set_cpu_cap(c, X86_FEATURE_PAE);
                 add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
         }
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c

index 0b6c52388cf484f809a7f328f475424dd262417c..de6626c18e427b771ea902451ae77ab52e233915 100644 (file)
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -14,7 +14,7 @@
  #include <linux/sysfs.h>
  #include <linux/pci.h>
  
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
  #include <asm/amd_nb.h>
  #include <asm/smp.h>
  
@@ -444,7 +444,7 @@ static ssize_t store_cache_disable(struct cacheinfo *this_leaf,
         err = amd_set_l3_disable_slot(nb, cpu, slot, val);
         if (err) {
                 if (err == -EEXIST)
-                       pr_warning("L3 slot %d in use/index already disabled!\n",
+                       pr_warn("L3 slot %d in use/index already disabled!\n",
                                    slot);
                 return err;
         }
diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h

deleted file mode 100644 (file)

index 336878a..0000000
--- a/arch/x86/kernel/cpu/intel_pt.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Intel(R) Processor Trace PMU driver for perf
- * Copyright (c) 2013-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * Intel PT is specified in the Intel Architecture Instruction Set Extensions
- * Programming Reference:
- * http://software.intel.com/en-us/intel-isa-extensions
- */
-
-#ifndef __INTEL_PT_H__
-#define __INTEL_PT_H__
-
-/*
- * Single-entry ToPA: when this close to region boundary, switch
- * buffers to avoid losing data.
- */
-#define TOPA_PMI_MARGIN 512
-
-#define TOPA_SHIFT 12
-
-static inline unsigned int sizes(unsigned int tsz)
-{
-       return 1 << (tsz + TOPA_SHIFT);
-};
-
-struct topa_entry {
-       u64     end     : 1;
-       u64     rsvd0   : 1;
-       u64     intr    : 1;
-       u64     rsvd1   : 1;
-       u64     stop    : 1;
-       u64     rsvd2   : 1;
-       u64     size    : 4;
-       u64     rsvd3   : 2;
-       u64     base    : 36;
-       u64     rsvd4   : 16;
-};
-
-#define PT_CPUID_LEAVES                2
-#define PT_CPUID_REGS_NUM      4 /* number of regsters (eax, ebx, ecx, edx) */
-
-enum pt_capabilities {
-       PT_CAP_max_subleaf = 0,
-       PT_CAP_cr3_filtering,
-       PT_CAP_psb_cyc,
-       PT_CAP_mtc,
-       PT_CAP_topa_output,
-       PT_CAP_topa_multiple_entries,
-       PT_CAP_single_range_output,
-       PT_CAP_payloads_lip,
-       PT_CAP_mtc_periods,
-       PT_CAP_cycle_thresholds,
-       PT_CAP_psb_periods,
-};
-
-struct pt_pmu {
-       struct pmu              pmu;
-       u32                     caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
-};
-
-/**
- * struct pt_buffer - buffer configuration; one buffer per task_struct or
- *             cpu, depending on perf event configuration
- * @cpu:       cpu for per-cpu allocation
- * @tables:    list of ToPA tables in this buffer
- * @first:     shorthand for first topa table
- * @last:      shorthand for last topa table
- * @cur:       current topa table
- * @nr_pages:  buffer size in pages
- * @cur_idx:   current output region's index within @cur table
- * @output_off:        offset within the current output region
- * @data_size: running total of the amount of data in this buffer
- * @lost:      if data was lost/truncated
- * @head:      logical write offset inside the buffer
- * @snapshot:  if this is for a snapshot/overwrite counter
- * @stop_pos:  STOP topa entry in the buffer
- * @intr_pos:  INT topa entry in the buffer
- * @data_pages:        array of pages from perf
- * @topa_index:        table of topa entries indexed by page offset
- */
-struct pt_buffer {
-       int                     cpu;
-       struct list_head        tables;
-       struct topa             *first, *last, *cur;
-       unsigned int            cur_idx;
-       size_t                  output_off;
-       unsigned long           nr_pages;
-       local_t                 data_size;
-       local_t                 lost;
-       local64_t               head;
-       bool                    snapshot;
-       unsigned long           stop_pos, intr_pos;
-       void                    **data_pages;
-       struct topa_entry       *topa_index[0];
-};
-
-/**
- * struct pt - per-cpu pt context
- * @handle:    perf output handle
- * @handle_nmi:        do handle PT PMI on this cpu, there's an active event
- */
-struct pt {
-       struct perf_output_handle handle;
-       int                     handle_nmi;
-};
-
-#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c

index afa9f0d487ea07b79936fcd5b62040540102fad3..fbb5e90557a5257bbde11179cdeb415e60485019 100644 (file)
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -1,5 +1,5 @@
  #include <asm/cpu_device_id.h>
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
  #include <linux/cpu.h>
  #include <linux/module.h>
  #include <linux/slab.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c

index 4cfba4371a71f28c4615610475e3eec9e8d9790c..517619ea6498b41839f87f28bce4a76b1e43c7ab 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -115,7 +115,7 @@ static int raise_local(void)
         int cpu = m->extcpu;
  
         if (m->inject_flags & MCJ_EXCEPTION) {
-               printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
+               pr_info("Triggering MCE exception on CPU %d\n", cpu);
                 switch (context) {
                 case MCJ_CTX_IRQ:
                         /*
@@ -128,15 +128,15 @@ static int raise_local(void)
                         raise_exception(m, NULL);
                         break;
                 default:
-                       printk(KERN_INFO "Invalid MCE context\n");
+                       pr_info("Invalid MCE context\n");
                         ret = -EINVAL;
                 }
-               printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
+               pr_info("MCE exception done on CPU %d\n", cpu);
         } else if (m->status) {
-               printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
+               pr_info("Starting machine check poll CPU %d\n", cpu);
                 raise_poll(m);
                 mce_notify_irq();
-               printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu);
+               pr_info("Machine check poll done on CPU %d\n", cpu);
         } else
                 m->finished = 0;
  
@@ -183,8 +183,7 @@ static void raise_mce(struct mce *m)
                 start = jiffies;
                 while (!cpumask_empty(mce_inject_cpumask)) {
                         if (!time_before(jiffies, start + 2*HZ)) {
-                               printk(KERN_ERR
-                               "Timeout waiting for mce inject %lx\n",
+                               pr_err("Timeout waiting for mce inject %lx\n",
                                         *cpumask_bits(mce_inject_cpumask));
                                 break;
                         }
@@ -241,7 +240,7 @@ static int inject_init(void)
  {
         if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
                 return -ENOMEM;
-       printk(KERN_INFO "Machine check injector initialized\n");
+       pr_info("Machine check injector initialized\n");
         register_mce_write_callback(mce_write);
         register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0,
                                 "mce_notify");
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c

index 9c682c222071db1960ba848c24c76567306d0542..5119766d988925a4c8eb9df23a3ff7b1b626d174 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -14,6 +14,7 @@
  #include <linux/init.h>
  #include <linux/debugfs.h>
  #include <asm/mce.h>
+#include <asm/uaccess.h>
  
  #include "mce-internal.h"
  
@@ -29,7 +30,7 @@
   * panic situations)
   */
  
-enum context { IN_KERNEL = 1, IN_USER = 2 };
+enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
  enum ser { SER_REQUIRED = 1, NO_SER = 2 };
  enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
  
@@ -48,6 +49,7 @@ static struct severity {
  #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
  #define  KERNEL                .context = IN_KERNEL
  #define  USER          .context = IN_USER
+#define  KERNEL_RECOV  .context = IN_KERNEL_RECOV
  #define  SER           .ser = SER_REQUIRED
  #define  NOSER         .ser = NO_SER
  #define  EXCP          .excp = EXCP_CONTEXT
@@ -86,6 +88,10 @@ static struct severity {
                 PANIC, "In kernel and no restart IP",
                 EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
                 ),
+       MCESEV(
+               PANIC, "In kernel and no restart IP",
+               EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
+               ),
         MCESEV(
                 DEFERRED, "Deferred error",
                 NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
@@ -122,6 +128,11 @@ static struct severity {
                 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
                 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
                 ),
+       MCESEV(
+               AR, "Action required: data load in error recoverable area of kernel",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+               KERNEL_RECOV
+               ),
         MCESEV(
                 AR, "Action required: data load error in a user process",
                 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
@@ -170,6 +181,9 @@ static struct severity {
                 )       /* always matches. keep at end */
  };
  
+#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
+                               (MCG_STATUS_RIPV|MCG_STATUS_EIPV))
+
  /*
   * If mcgstatus indicated that ip/cs on the stack were
   * no good, then "m->cs" will be zero and we will have
@@ -183,7 +197,11 @@ static struct severity {
   */
  static int error_context(struct mce *m)
  {
-       return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
+       if ((m->cs & 3) == 3)
+               return IN_USER;
+       if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip))
+               return IN_KERNEL_RECOV;
+       return IN_KERNEL;
  }
  
  /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c

index a006f4cd792b10d54a92eff4a3b9c6860aa5382e..f0c921b03e4245e1f7a4b16687c0d9e7600b4256 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -961,6 +961,20 @@ static void mce_clear_state(unsigned long *toclear)
         }
  }
  
+static int do_memory_failure(struct mce *m)
+{
+       int flags = MF_ACTION_REQUIRED;
+       int ret;
+
+       pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
+       if (!(m->mcgstatus & MCG_STATUS_RIPV))
+               flags |= MF_MUST_KILL;
+       ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
+       if (ret)
+               pr_err("Memory error not recovered");
+       return ret;
+}
+
  /*
   * The actual machine check handler. This only handles real
   * exceptions when something got corrupted coming in through int 18.
@@ -998,8 +1012,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
         DECLARE_BITMAP(toclear, MAX_NR_BANKS);
         DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
         char *msg = "Unknown";
-       u64 recover_paddr = ~0ull;
-       int flags = MF_ACTION_REQUIRED;
         int lmce = 0;
  
         /* If this CPU is offline, just bail out. */
@@ -1136,22 +1148,13 @@ void do_machine_check(struct pt_regs *regs, long error_code)
         }
  
         /*
-        * At insane "tolerant" levels we take no action. Otherwise
-        * we only die if we have no other choice. For less serious
-        * issues we try to recover, or limit damage to the current
-        * process.
+        * If tolerant is at an insane level we drop requests to kill
+        * processes and continue even when there is no way out.
          */
-       if (cfg->tolerant < 3) {
-               if (no_way_out)
-                       mce_panic("Fatal machine check on current CPU", &m, msg);
-               if (worst == MCE_AR_SEVERITY) {
-                       recover_paddr = m.addr;
-                       if (!(m.mcgstatus & MCG_STATUS_RIPV))
-                               flags |= MF_MUST_KILL;
-               } else if (kill_it) {
-                       force_sig(SIGBUS, current);
-               }
-       }
+       if (cfg->tolerant == 3)
+               kill_it = 0;
+       else if (no_way_out)
+               mce_panic("Fatal machine check on current CPU", &m, msg);
  
         if (worst > 0)
                 mce_report_event(regs);
@@ -1159,25 +1162,24 @@ void do_machine_check(struct pt_regs *regs, long error_code)
  out:
         sync_core();
  
-       if (recover_paddr == ~0ull)
-               goto done;
+       if (worst != MCE_AR_SEVERITY && !kill_it)
+               goto out_ist;
  
-       pr_err("Uncorrected hardware memory error in user-access at %llx",
-                recover_paddr);
-       /*
-        * We must call memory_failure() here even if the current process is
-        * doomed. We still need to mark the page as poisoned and alert any
-        * other users of the page.
-        */
-       ist_begin_non_atomic(regs);
-       local_irq_enable();
-       if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
-               pr_err("Memory error not recovered");
-               force_sig(SIGBUS, current);
+       /* Fault was in user mode and we need to take some action */
+       if ((m.cs & 3) == 3) {
+               ist_begin_non_atomic(regs);
+               local_irq_enable();
+
+               if (kill_it || do_memory_failure(&m))
+                       force_sig(SIGBUS, current);
+               local_irq_disable();
+               ist_end_non_atomic();
+       } else {
+               if (!fixup_exception(regs, X86_TRAP_MC))
+                       mce_panic("Failed kernel mode recovery", &m, NULL);
         }
-       local_irq_disable();
-       ist_end_non_atomic();
-done:
+
+out_ist:
         ist_exit(regs);
  }
  EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1576,6 +1578,17 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
  
                 if (c->x86 == 6 && c->x86_model == 45)
                         quirk_no_way_out = quirk_sandybridge_ifu;
+               /*
+                * MCG_CAP.MCG_SER_P is necessary but not sufficient to know
+                * whether this processor will actually generate recoverable
+                * machine checks. Check to see if this is an E7 model Xeon.
+                * We can't do a model number check because E5 and E7 use the
+                * same model number. E5 doesn't support recovery, E7 does.
+                */
+               if (mca_cfg.recovery || (mca_cfg.ser &&
+                       !strncmp(c->x86_model_id,
+                                "Intel(R) Xeon(R) CPU E7-", 24)))
+                       set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY);
         }
         if (cfg->monarch_timeout < 0)
                 cfg->monarch_timeout = 0;
@@ -1617,10 +1630,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
         case X86_VENDOR_AMD: {
                 u32 ebx = cpuid_ebx(0x80000007);
  
-               mce_amd_feature_init(c);
                 mce_flags.overflow_recov = !!(ebx & BIT(0));
                 mce_flags.succor         = !!(ebx & BIT(1));
                 mce_flags.smca           = !!(ebx & BIT(3));
+               mce_amd_feature_init(c);
  
                 break;
                 }
@@ -2028,6 +2041,8 @@ static int __init mcheck_enable(char *str)
                 cfg->bootlog = (str[0] == 'b');
         else if (!strcmp(str, "bios_cmci_threshold"))
                 cfg->bios_cmci_threshold = true;
+       else if (!strcmp(str, "recovery"))
+               cfg->recovery = true;
         else if (isdigit(str[0])) {
                 if (get_option(&str, &cfg->tolerant) == 2)
                         get_option(&str, &(cfg->monarch_timeout));
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c

index e99b15077e9464b9c9f337873ae58101285e3215..9d656fd436efd6c90ad148a3cfbbd673248000f5 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -1,5 +1,5 @@
  /*
- *  (c) 2005-2015 Advanced Micro Devices, Inc.
+ *  (c) 2005-2016 Advanced Micro Devices, Inc.
   *  Your use of this code is subject to the terms and conditions of the
   *  GNU general public license version 2. See "COPYING" or
   *  http://www.gnu.org/licenses/gpl.html
@@ -28,7 +28,7 @@
  #include <asm/msr.h>
  #include <asm/trace/irq_vectors.h>
  
-#define NR_BLOCKS         9
+#define NR_BLOCKS         5
  #define THRESHOLD_MAX     0xFFF
  #define INT_TYPE_APIC     0x00020000
  #define MASK_VALID_HI     0x80000000
@@ -49,6 +49,19 @@
  #define DEF_LVT_OFF            0x2
  #define DEF_INT_TYPE_APIC      0x2
  
+/* Scalable MCA: */
+
+/* Threshold LVT offset is at MSR0xC0000410[15:12] */
+#define SMCA_THR_LVT_OFF       0xF000
+
+/*
+ * OS is required to set the MCAX bit to acknowledge that it is now using the
+ * new MSR ranges and new registers under each bank. It also means that the OS
+ * will configure deferred errors in the new MCx_CONFIG register. If the bit is
+ * not set, uncorrectable errors will cause a system panic.
+ */
+#define SMCA_MCAX_EN_OFF       0x1
+
  static const char * const th_names[] = {
         "load_store",
         "insn_fetch",
@@ -58,6 +71,35 @@ static const char * const th_names[] = {
         "execution_unit",
  };
  
+/* Define HWID to IP type mappings for Scalable MCA */
+struct amd_hwid amd_hwids[] = {
+       [SMCA_F17H_CORE]        = { "f17h_core",        0xB0 },
+       [SMCA_DF]               = { "data_fabric",      0x2E },
+       [SMCA_UMC]              = { "umc",              0x96 },
+       [SMCA_PB]               = { "param_block",      0x5 },
+       [SMCA_PSP]              = { "psp",              0xFF },
+       [SMCA_SMU]              = { "smu",              0x1 },
+};
+EXPORT_SYMBOL_GPL(amd_hwids);
+
+const char * const amd_core_mcablock_names[] = {
+       [SMCA_LS]               = "load_store",
+       [SMCA_IF]               = "insn_fetch",
+       [SMCA_L2_CACHE]         = "l2_cache",
+       [SMCA_DE]               = "decode_unit",
+       [RES]                   = "",
+       [SMCA_EX]               = "execution_unit",
+       [SMCA_FP]               = "floating_point",
+       [SMCA_L3_CACHE]         = "l3_cache",
+};
+EXPORT_SYMBOL_GPL(amd_core_mcablock_names);
+
+const char * const amd_df_mcablock_names[] = {
+       [SMCA_CS]               = "coherent_slave",
+       [SMCA_PIE]              = "pie",
+};
+EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
+
  static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
  static DEFINE_PER_CPU(unsigned char, bank_map);        /* see which banks are on */
  
@@ -84,6 +126,13 @@ struct thresh_restart {
  
  static inline bool is_shared_bank(int bank)
  {
+       /*
+        * Scalable MCA provides for only one core to have access to the MSRs of
+        * a shared bank.
+        */
+       if (mce_flags.smca)
+               return false;
+
         /* Bank 4 is for northbridge reporting and is thus shared */
         return (bank == 4);
  }
@@ -135,6 +184,14 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
         }
  
         if (apic != msr) {
+               /*
+                * On SMCA CPUs, LVT offset is programmed at a different MSR, and
+                * the BIOS provides the value. The original field where LVT offset
+                * was set is reserved. Return early here:
+                */
+               if (mce_flags.smca)
+                       return 0;
+
                 pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
                        "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
                        b->cpu, apic, b->bank, b->block, b->address, hi, lo);
@@ -144,10 +201,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
         return 1;
  };
  
-/*
- * Called via smp_call_function_single(), must be called with correct
- * cpu affinity.
- */
+/* Reprogram MCx_MISC MSR behind this threshold bank. */
  static void threshold_restart_bank(void *_tr)
  {
         struct thresh_restart *tr = _tr;
@@ -247,27 +301,116 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
         wrmsr(MSR_CU_DEF_ERR, low, high);
  }
  
+static u32 get_block_address(u32 current_addr, u32 low, u32 high,
+                            unsigned int bank, unsigned int block)
+{
+       u32 addr = 0, offset = 0;
+
+       if (mce_flags.smca) {
+               if (!block) {
+                       addr = MSR_AMD64_SMCA_MCx_MISC(bank);
+               } else {
+                       /*
+                        * For SMCA enabled processors, BLKPTR field of the
+                        * first MISC register (MCx_MISC0) indicates presence of
+                        * additional MISC register set (MISC1-4).
+                        */
+                       u32 low, high;
+
+                       if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
+                               return addr;
+
+                       if (!(low & MCI_CONFIG_MCAX))
+                               return addr;
+
+                       if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
+                           (low & MASK_BLKPTR_LO))
+                               addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+               }
+               return addr;
+       }
+
+       /* Fall back to method we used for older processors: */
+       switch (block) {
+       case 0:
+               addr = MSR_IA32_MCx_MISC(bank);
+               break;
+       case 1:
+               offset = ((low & MASK_BLKPTR_LO) >> 21);
+               if (offset)
+                       addr = MCG_XBLK_ADDR + offset;
+               break;
+       default:
+               addr = ++current_addr;
+       }
+       return addr;
+}
+
+static int
+prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
+                       int offset, u32 misc_high)
+{
+       unsigned int cpu = smp_processor_id();
+       struct threshold_block b;
+       int new;
+
+       if (!block)
+               per_cpu(bank_map, cpu) |= (1 << bank);
+
+       memset(&b, 0, sizeof(b));
+       b.cpu                   = cpu;
+       b.bank                  = bank;
+       b.block                 = block;
+       b.address               = addr;
+       b.interrupt_capable     = lvt_interrupt_supported(bank, misc_high);
+
+       if (!b.interrupt_capable)
+               goto done;
+
+       b.interrupt_enable = 1;
+
+       if (mce_flags.smca) {
+               u32 smca_low, smca_high;
+               u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank);
+
+               if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) {
+                       smca_high |= SMCA_MCAX_EN_OFF;
+                       wrmsr(smca_addr, smca_low, smca_high);
+               }
+
+               /* Gather LVT offset for thresholding: */
+               if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
+                       goto out;
+
+               new = (smca_low & SMCA_THR_LVT_OFF) >> 12;
+       } else {
+               new = (misc_high & MASK_LVTOFF_HI) >> 20;
+       }
+
+       offset = setup_APIC_mce_threshold(offset, new);
+
+       if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt))
+               mce_threshold_vector = amd_threshold_interrupt;
+
+done:
+       mce_threshold_block_init(&b, offset);
+
+out:
+       return offset;
+}
+
  /* cpu init entry point, called from mce.c with preempt off */
  void mce_amd_feature_init(struct cpuinfo_x86 *c)
  {
-       struct threshold_block b;
-       unsigned int cpu = smp_processor_id();
         u32 low = 0, high = 0, address = 0;
         unsigned int bank, block;
-       int offset = -1, new;
+       int offset = -1;
  
         for (bank = 0; bank < mca_cfg.banks; ++bank) {
                 for (block = 0; block < NR_BLOCKS; ++block) {
-                       if (block == 0)
-                               address = MSR_IA32_MCx_MISC(bank);
-                       else if (block == 1) {
-                               address = (low & MASK_BLKPTR_LO) >> 21;
-                               if (!address)
-                                       break;
-
-                               address += MCG_XBLK_ADDR;
-                       } else
-                               ++address;
+                       address = get_block_address(address, low, high, bank, block);
+                       if (!address)
+                               break;
  
                         if (rdmsr_safe(address, &low, &high))
                                 break;
@@ -279,29 +422,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
                              (high & MASK_LOCKED_HI))
                                 continue;
  
-                       if (!block)
-                               per_cpu(bank_map, cpu) |= (1 << bank);
-
-                       memset(&b, 0, sizeof(b));
-                       b.cpu                   = cpu;
-                       b.bank                  = bank;
-                       b.block                 = block;
-                       b.address               = address;
-                       b.interrupt_capable     = lvt_interrupt_supported(bank, high);
-
-                       if (!b.interrupt_capable)
-                               goto init;
-
-                       b.interrupt_enable = 1;
-                       new     = (high & MASK_LVTOFF_HI) >> 20;
-                       offset  = setup_APIC_mce_threshold(offset, new);
-
-                       if ((offset == new) &&
-                           (mce_threshold_vector != amd_threshold_interrupt))
-                               mce_threshold_vector = amd_threshold_interrupt;
-
-init:
-                       mce_threshold_block_init(&b, offset);
+                       offset = prepare_threshold_block(bank, block, address, offset, high);
                 }
         }
  
@@ -394,16 +515,9 @@ static void amd_threshold_interrupt(void)
                 if (!(per_cpu(bank_map, cpu) & (1 << bank)))
                         continue;
                 for (block = 0; block < NR_BLOCKS; ++block) {
-                       if (block == 0) {
-                               address = MSR_IA32_MCx_MISC(bank);
-                       } else if (block == 1) {
-                               address = (low & MASK_BLKPTR_LO) >> 21;
-                               if (!address)
-                                       break;
-                               address += MCG_XBLK_ADDR;
-                       } else {
-                               ++address;
-                       }
+                       address = get_block_address(address, low, high, bank, block);
+                       if (!address)
+                               break;
  
                         if (rdmsr_safe(address, &low, &high))
                                 break;
@@ -623,16 +737,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
         if (err)
                 goto out_free;
  recurse:
-       if (!block) {
-               address = (low & MASK_BLKPTR_LO) >> 21;
-               if (!address)
-                       return 0;
-               address += MCG_XBLK_ADDR;
-       } else {
-               ++address;
-       }
+       address = get_block_address(address, low, high, bank, ++block);
+       if (!address)
+               return 0;
  
-       err = allocate_threshold_blocks(cpu, bank, ++block, address);
+       err = allocate_threshold_blocks(cpu, bank, block, address);
         if (err)
                 goto out_free;
  
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c

index 12402e10aeffda428821dd6dcb88597203e48054..2a0717bf803372d3968dca9d2b2e6c81740d946d 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -26,14 +26,12 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
         rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
         rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
  
-       printk(KERN_EMERG
-               "CPU#%d: Machine Check Exception:  0x%8X (type 0x%8X).\n",
-               smp_processor_id(), loaddr, lotype);
+       pr_emerg("CPU#%d: Machine Check Exception:  0x%8X (type 0x%8X).\n",
+                smp_processor_id(), loaddr, lotype);
  
         if (lotype & (1<<5)) {
-               printk(KERN_EMERG
-                       "CPU#%d: Possible thermal failure (CPU on fire ?).\n",
-                       smp_processor_id());
+               pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n",
+                        smp_processor_id());
         }
  
         add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
@@ -61,12 +59,10 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
         /* Read registers before enabling: */
         rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
         rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
-       printk(KERN_INFO
-              "Intel old style machine check architecture supported.\n");
+       pr_info("Intel old style machine check architecture supported.\n");
  
         /* Enable MCE: */
         cr4_set_bits(X86_CR4_MCE);
-       printk(KERN_INFO
-              "Intel old style machine check reporting enabled on CPU#%d.\n",
-              smp_processor_id());
+       pr_info("Intel old style machine check reporting enabled on CPU#%d.\n",
+               smp_processor_id());
  }
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c

index 2c5aaf8c2e2f3dcc94d348dfe91da6d7d5000ae9..0b445c2ff735d44fedc469fd8ce0c1b8b1f1633b 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -190,7 +190,7 @@ static int therm_throt_process(bool new_event, int event, int level)
         /* if we just entered the thermal event */
         if (new_event) {
                 if (event == THERMAL_THROTTLING_EVENT)
-                       printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
+                       pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
                                 this_cpu,
                                 level == CORE_LEVEL ? "Core" : "Package",
                                 state->count);
@@ -198,8 +198,7 @@ static int therm_throt_process(bool new_event, int event, int level)
         }
         if (old_event) {
                 if (event == THERMAL_THROTTLING_EVENT)
-                       printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
-                               this_cpu,
+                       pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
                                 level == CORE_LEVEL ? "Core" : "Package");
                 return 1;
         }
@@ -417,8 +416,8 @@ static void intel_thermal_interrupt(void)
  
  static void unexpected_thermal_interrupt(void)
  {
-       printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
-                       smp_processor_id());
+       pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
+               smp_processor_id());
  }
  
  static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
@@ -499,7 +498,7 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
  
         if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
                 if (system_state == SYSTEM_BOOTING)
-                       printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu);
+                       pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu);
                 return;
         }
  
@@ -557,8 +556,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
         l = apic_read(APIC_LVTTHMR);
         apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
  
-       printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
-                      tm2 ? "TM2" : "TM1");
+       pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
+                     tm2 ? "TM2" : "TM1");
  
         /* enable thermal throttle processing */
         atomic_set(&therm_throt_en, 1);
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c

index 7245980186eea047e643af5010e1509bfc991632..fcf9ae9384f4cb4693d67cc8ee6433562dfc9f34 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -12,8 +12,8 @@
  
  static void default_threshold_interrupt(void)
  {
-       printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
-                        THRESHOLD_APIC_VECTOR);
+       pr_err("Unexpected threshold interrupt at vector %x\n",
+               THRESHOLD_APIC_VECTOR);
  }
  
  void (*mce_threshold_vector)(void) = default_threshold_interrupt;
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c

index 01dd8702880b7f2db9fcdc3874ba8809c2ce9b80..c6a722e1d011458fa30ec5ad3ad4e78c58d2fa82 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -17,7 +17,7 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)
  {
         ist_enter(regs);
  
-       printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
+       pr_emerg("CPU0: Machine Check Exception.\n");
         add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
  
         ist_exit(regs);
@@ -39,6 +39,5 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
  
         cr4_set_bits(X86_CR4_MCE);
  
-       printk(KERN_INFO
-              "Winchip machine check reporting enabled on CPU#0.\n");
+       pr_info("Winchip machine check reporting enabled on CPU#0.\n");
  }
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c

index 2233f8a766156891a52b9a7658f04efeaf4f86c8..75d3aab5f7b243623ca311c4c23bf07ef2e26ca4 100644 (file)
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -953,7 +953,7 @@ struct microcode_ops * __init init_amd_microcode(void)
         struct cpuinfo_x86 *c = &boot_cpu_data;
  
         if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
-               pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
+               pr_warn("AMD CPU family 0x%x not supported\n", c->x86);
                 return NULL;
         }
  
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh

index 3f20710a5b23b7f7456e99bb7f654ce703bb6abf..6988c74409a825313a2711be6914a20369d223e0 100644 (file)
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -1,6 +1,6 @@
  #!/bin/sh
  #
-# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h
+# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h
  #
  
  IN=$1
@@ -49,8 +49,8 @@ dump_array()
  trap 'rm "$OUT"' EXIT
  
  (
-       echo "#ifndef _ASM_X86_CPUFEATURE_H"
-       echo "#include <asm/cpufeature.h>"
+       echo "#ifndef _ASM_X86_CPUFEATURES_H"
+       echo "#include <asm/cpufeatures.h>"
         echo "#endif"
         echo ""
  
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c

index 20e242ea1bc46b5f5828c7b95071d920853b7609..4e7c6933691cc8de42aa4a82473482127bd0faab 100644 (file)
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -161,8 +161,8 @@ static void __init ms_hyperv_init_platform(void)
         ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
         ms_hyperv.hints    = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
  
-       printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
-              ms_hyperv.features, ms_hyperv.hints);
+       pr_info("HyperV: features 0x%x, hints 0x%x\n",
+               ms_hyperv.features, ms_hyperv.hints);
  
  #ifdef CONFIG_X86_LOCAL_APIC
         if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) {
@@ -174,8 +174,8 @@ static void __init ms_hyperv_init_platform(void)
                 rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
                 hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
                 lapic_timer_frequency = hv_lapic_frequency;
-               printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n",
-                               lapic_timer_frequency);
+               pr_info("HyperV: LAPIC Timer Frequency: %#x\n",
+                       lapic_timer_frequency);
         }
  #endif
  
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c

index 316fe3e60a9764e479d4e49d01346c7921763ee8..3d689937fc1b13974a0a8a5edd31ddda0f6401b2 100644 (file)
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -103,7 +103,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
          */
         if (type != MTRR_TYPE_WRCOMB &&
             (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) {
-               pr_warning("mtrr: only write-combining%s supported\n",
+               pr_warn("mtrr: only write-combining%s supported\n",
                            centaur_mcr_type ? " and uncacheable are" : " is");
                 return -EINVAL;
         }
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c

index 0d98503c2245aab81283526c062f746650b99c32..31e951ce6dff33782c9610c7b43898181d59d637 100644 (file)
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -57,9 +57,9 @@ static int __initdata                         nr_range;
  static struct var_mtrr_range_state __initdata  range_state[RANGE_NUM];
  
  static int __initdata debug_print;
-#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
+#define Dprintk(x...) do { if (debug_print) pr_debug(x); } while (0)
  
-#define BIOS_BUG_MSG KERN_WARNING \
+#define BIOS_BUG_MSG \
         "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
  
  static int __init
@@ -81,9 +81,9 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
                                                 base, base + size);
         }
         if (debug_print) {
-               printk(KERN_DEBUG "After WB checking\n");
+               pr_debug("After WB checking\n");
                 for (i = 0; i < nr_range; i++)
-                       printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+                       pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
                                  range[i].start, range[i].end);
         }
  
@@ -101,7 +101,7 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
                     (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
                     (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
                         /* Var MTRR contains UC entry below 1M? Skip it: */
-                       printk(BIOS_BUG_MSG, i);
+                       pr_warn(BIOS_BUG_MSG, i);
                         if (base + size <= (1<<(20-PAGE_SHIFT)))
                                 continue;
                         size -= (1<<(20-PAGE_SHIFT)) - base;
@@ -114,11 +114,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
                                  extra_remove_base + extra_remove_size);
  
         if  (debug_print) {
-               printk(KERN_DEBUG "After UC checking\n");
+               pr_debug("After UC checking\n");
                 for (i = 0; i < RANGE_NUM; i++) {
                         if (!range[i].end)
                                 continue;
-                       printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+                       pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
                                  range[i].start, range[i].end);
                 }
         }
@@ -126,9 +126,9 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
         /* sort the ranges */
         nr_range = clean_sort_range(range, RANGE_NUM);
         if  (debug_print) {
-               printk(KERN_DEBUG "After sorting\n");
+               pr_debug("After sorting\n");
                 for (i = 0; i < nr_range; i++)
-                       printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+                       pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
                                  range[i].start, range[i].end);
         }
  
@@ -544,7 +544,7 @@ static void __init print_out_mtrr_range_state(void)
                 start_base = to_size_factor(start_base, &start_factor),
                 type = range_state[i].type;
  
-               printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+               pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
                         i, start_base, start_factor,
                         size_base, size_factor,
                         (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
@@ -713,7 +713,7 @@ int __init mtrr_cleanup(unsigned address_bits)
                 return 0;
  
         /* Print original var MTRRs at first, for debugging: */
-       printk(KERN_DEBUG "original variable MTRRs\n");
+       pr_debug("original variable MTRRs\n");
         print_out_mtrr_range_state();
  
         memset(range, 0, sizeof(range));
@@ -733,7 +733,7 @@ int __init mtrr_cleanup(unsigned address_bits)
                                           x_remove_base, x_remove_size);
  
         range_sums = sum_ranges(range, nr_range);
-       printk(KERN_INFO "total RAM covered: %ldM\n",
+       pr_info("total RAM covered: %ldM\n",
                range_sums >> (20 - PAGE_SHIFT));
  
         if (mtrr_chunk_size && mtrr_gran_size) {
@@ -745,12 +745,11 @@ int __init mtrr_cleanup(unsigned address_bits)
  
                 if (!result[i].bad) {
                         set_var_mtrr_all(address_bits);
-                       printk(KERN_DEBUG "New variable MTRRs\n");
+                       pr_debug("New variable MTRRs\n");
                         print_out_mtrr_range_state();
                         return 1;
                 }
-               printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
-                      "will find optimal one\n");
+               pr_info("invalid mtrr_gran_size or mtrr_chunk_size, will find optimal one\n");
         }
  
         i = 0;
@@ -768,7 +767,7 @@ int __init mtrr_cleanup(unsigned address_bits)
                                       x_remove_base, x_remove_size, i);
                         if (debug_print) {
                                 mtrr_print_out_one_result(i);
-                               printk(KERN_INFO "\n");
+                               pr_info("\n");
                         }
  
                         i++;
@@ -779,7 +778,7 @@ int __init mtrr_cleanup(unsigned address_bits)
         index_good = mtrr_search_optimal_index();
  
         if (index_good != -1) {
-               printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+               pr_info("Found optimal setting for mtrr clean up\n");
                 i = index_good;
                 mtrr_print_out_one_result(i);
  
@@ -790,7 +789,7 @@ int __init mtrr_cleanup(unsigned address_bits)
                 gran_size <<= 10;
                 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
                 set_var_mtrr_all(address_bits);
-               printk(KERN_DEBUG "New variable MTRRs\n");
+               pr_debug("New variable MTRRs\n");
                 print_out_mtrr_range_state();
                 return 1;
         } else {
@@ -799,8 +798,8 @@ int __init mtrr_cleanup(unsigned address_bits)
                         mtrr_print_out_one_result(i);
         }
  
-       printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
-       printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+       pr_info("mtrr_cleanup: can not find optimal value\n");
+       pr_info("please specify mtrr_gran_size/mtrr_chunk_size\n");
  
         return 0;
  }
@@ -918,7 +917,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
  
         /* kvm/qemu doesn't have mtrr set right, don't trim them all: */
         if (!highest_pfn) {
-               printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
+               pr_info("CPU MTRRs all blank - virtualized system.\n");
                 return 0;
         }
  
@@ -973,7 +972,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
                                                          end_pfn);
  
         if (total_trim_size) {
-               pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20);
+               pr_warn("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n",
+                       total_trim_size >> 20);
  
                 if (!changed_by_mtrr_cleanup)
                         WARN_ON(1);
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c

index c870af1610083ec3dda7cb61b966860c9a224374..fcbcb2f678ca47360d23623887d216165969d6e8 100644 (file)
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -55,7 +55,7 @@ static inline void k8_check_syscfg_dram_mod_en(void)
  
         rdmsr(MSR_K8_SYSCFG, lo, hi);
         if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
-               printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
+               pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
                        " not cleared by BIOS, clearing this bit\n",
                        smp_processor_id());
                 lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
@@ -501,14 +501,14 @@ void __init mtrr_state_warn(void)
         if (!mask)
                 return;
         if (mask & MTRR_CHANGE_MASK_FIXED)
-               pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
+               pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
         if (mask & MTRR_CHANGE_MASK_VARIABLE)
-               pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n");
+               pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n");
         if (mask & MTRR_CHANGE_MASK_DEFTYPE)
-               pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
+               pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
  
-       printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n");
-       printk(KERN_INFO "mtrr: corrected configuration.\n");
+       pr_info("mtrr: probably your BIOS does not setup all CPUs.\n");
+       pr_info("mtrr: corrected configuration.\n");
  }
  
  /*
@@ -519,8 +519,7 @@ void __init mtrr_state_warn(void)
  void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
  {
         if (wrmsr_safe(msr, a, b) < 0) {
-               printk(KERN_ERR
-                       "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
+               pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
                         smp_processor_id(), msr, a, b);
         }
  }
@@ -607,7 +606,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
                 tmp |= ~((1ULL<<(hi - 1)) - 1);
  
                 if (tmp != mask) {
-                       printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
+                       pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
                         add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
                         mask = tmp;
                 }
@@ -858,13 +857,13 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
             boot_cpu_data.x86_model == 1 &&
             boot_cpu_data.x86_mask <= 7) {
                 if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
-                       pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
+                       pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
                         return -EINVAL;
                 }
                 if (!(base + size < 0x70000 || base > 0x7003F) &&
                     (type == MTRR_TYPE_WRCOMB
                      || type == MTRR_TYPE_WRBACK)) {
-                       pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
+                       pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
                         return -EINVAL;
                 }
         }
@@ -878,7 +877,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
              lbase = lbase >> 1, last = last >> 1)
                 ;
         if (lbase != last) {
-               pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
+               pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
                 return -EINVAL;
         }
         return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c

index 5c3d149ee91cb1f6c87ff6ad1853a38adfce82da..10f8d4796240709cea61c0e56522d75eeff8ffdd 100644 (file)
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -47,7 +47,7 @@
  #include <linux/smp.h>
  #include <linux/syscore_ops.h>
  
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
  #include <asm/e820.h>
  #include <asm/mtrr.h>
  #include <asm/msr.h>
@@ -300,24 +300,24 @@ int mtrr_add_page(unsigned long base, unsigned long size,
                 return error;
  
         if (type >= MTRR_NUM_TYPES) {
-               pr_warning("mtrr: type: %u invalid\n", type);
+               pr_warn("mtrr: type: %u invalid\n", type);
                 return -EINVAL;
         }
  
         /* If the type is WC, check that this processor supports it */
         if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
-               pr_warning("mtrr: your processor doesn't support write-combining\n");
+               pr_warn("mtrr: your processor doesn't support write-combining\n");
                 return -ENOSYS;
         }
  
         if (!size) {
-               pr_warning("mtrr: zero sized request\n");
+               pr_warn("mtrr: zero sized request\n");
                 return -EINVAL;
         }
  
         if ((base | (base + size - 1)) >>
             (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
-               pr_warning("mtrr: base or size exceeds the MTRR width\n");
+               pr_warn("mtrr: base or size exceeds the MTRR width\n");
                 return -EINVAL;
         }
  
@@ -348,7 +348,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
                                 } else if (types_compatible(type, ltype))
                                         continue;
                         }
-                       pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing"
+                       pr_warn("mtrr: 0x%lx000,0x%lx000 overlaps existing"
                                 " 0x%lx000,0x%lx000\n", base, size, lbase,
                                 lsize);
                         goto out;
@@ -357,7 +357,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
                 if (ltype != type) {
                         if (types_compatible(type, ltype))
                                 continue;
-                       pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
+                       pr_warn("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
                                 base, size, mtrr_attrib_to_str(ltype),
                                 mtrr_attrib_to_str(type));
                         goto out;
@@ -395,7 +395,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
  static int mtrr_check(unsigned long base, unsigned long size)
  {
         if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
-               pr_warning("mtrr: size and base must be multiples of 4 kiB\n");
+               pr_warn("mtrr: size and base must be multiples of 4 kiB\n");
                 pr_debug("mtrr: size: 0x%lx  base: 0x%lx\n", size, base);
                 dump_stack();
                 return -1;
@@ -493,16 +493,16 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
                 }
         }
         if (reg >= max) {
-               pr_warning("mtrr: register: %d too big\n", reg);
+               pr_warn("mtrr: register: %d too big\n", reg);
                 goto out;
         }
         mtrr_if->get(reg, &lbase, &lsize, &ltype);
         if (lsize < 1) {
-               pr_warning("mtrr: MTRR %d not used\n", reg);
+               pr_warn("mtrr: MTRR %d not used\n", reg);
                 goto out;
         }
         if (mtrr_usage_table[reg] < 1) {
-               pr_warning("mtrr: reg: %d has count=0\n", reg);
+               pr_warn("mtrr: reg: %d has count=0\n", reg);
                 goto out;
         }
         if (--mtrr_usage_table[reg] < 1)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c

deleted file mode 100644 (file)

index 1b443db..0000000
--- a/arch/x86/kernel/cpu/perf_event.c
+++ /dev/null
@@ -1,2428 +0,0 @@
-/*
- * Performance events x86 architecture code
- *
- *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- *  Copyright (C) 2009 Jaswinder Singh Rajput
- *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
- *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
- *  Copyright (C) 2009 Google, Inc., Stephane Eranian
- *
- *  For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-#include <linux/capability.h>
-#include <linux/notifier.h>
-#include <linux/hardirq.h>
-#include <linux/kprobes.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-#include <linux/sched.h>
-#include <linux/uaccess.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
-#include <linux/bitops.h>
-#include <linux/device.h>
-
-#include <asm/apic.h>
-#include <asm/stacktrace.h>
-#include <asm/nmi.h>
-#include <asm/smp.h>
-#include <asm/alternative.h>
-#include <asm/mmu_context.h>
-#include <asm/tlbflush.h>
-#include <asm/timer.h>
-#include <asm/desc.h>
-#include <asm/ldt.h>
-
-#include "perf_event.h"
-
-struct x86_pmu x86_pmu __read_mostly;
-
-DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
-       .enabled = 1,
-};
-
-struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;
-
-u64 __read_mostly hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
-u64 __read_mostly hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
-
-/*
- * Propagate event elapsed time into the generic event.
- * Can only be executed on the CPU where the event is active.
- * Returns the delta events processed.
- */
-u64 x86_perf_event_update(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       int shift = 64 - x86_pmu.cntval_bits;
-       u64 prev_raw_count, new_raw_count;
-       int idx = hwc->idx;
-       s64 delta;
-
-       if (idx == INTEL_PMC_IDX_FIXED_BTS)
-               return 0;
-
-       /*
-        * Careful: an NMI might modify the previous event value.
-        *
-        * Our tactic to handle this is to first atomically read and
-        * exchange a new raw count - then add that new-prev delta
-        * count to the generic event atomically:
-        */
-again:
-       prev_raw_count = local64_read(&hwc->prev_count);
-       rdpmcl(hwc->event_base_rdpmc, new_raw_count);
-
-       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                                       new_raw_count) != prev_raw_count)
-               goto again;
-
-       /*
-        * Now we have the new raw value and have updated the prev
-        * timestamp already. We can now calculate the elapsed delta
-        * (event-)time and add that to the generic event.
-        *
-        * Careful, not all hw sign-extends above the physical width
-        * of the count.
-        */
-       delta = (new_raw_count << shift) - (prev_raw_count << shift);
-       delta >>= shift;
-
-       local64_add(delta, &event->count);
-       local64_sub(delta, &hwc->period_left);
-
-       return new_raw_count;
-}
-
-/*
- * Find and validate any extra registers to set up.
- */
-static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg;
-       struct extra_reg *er;
-
-       reg = &event->hw.extra_reg;
-
-       if (!x86_pmu.extra_regs)
-               return 0;
-
-       for (er = x86_pmu.extra_regs; er->msr; er++) {
-               if (er->event != (config & er->config_mask))
-                       continue;
-               if (event->attr.config1 & ~er->valid_mask)
-                       return -EINVAL;
-               /* Check if the extra msrs can be safely accessed*/
-               if (!er->extra_msr_access)
-                       return -ENXIO;
-
-               reg->idx = er->idx;
-               reg->config = event->attr.config1;
-               reg->reg = er->msr;
-               break;
-       }
-       return 0;
-}
-
-static atomic_t active_events;
-static atomic_t pmc_refcount;
-static DEFINE_MUTEX(pmc_reserve_mutex);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-
-static bool reserve_pmc_hardware(void)
-{
-       int i;
-
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
-                       goto perfctr_fail;
-       }
-
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
-                       goto eventsel_fail;
-       }
-
-       return true;
-
-eventsel_fail:
-       for (i--; i >= 0; i--)
-               release_evntsel_nmi(x86_pmu_config_addr(i));
-
-       i = x86_pmu.num_counters;
-
-perfctr_fail:
-       for (i--; i >= 0; i--)
-               release_perfctr_nmi(x86_pmu_event_addr(i));
-
-       return false;
-}
-
-static void release_pmc_hardware(void)
-{
-       int i;
-
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               release_perfctr_nmi(x86_pmu_event_addr(i));
-               release_evntsel_nmi(x86_pmu_config_addr(i));
-       }
-}
-
-#else
-
-static bool reserve_pmc_hardware(void) { return true; }
-static void release_pmc_hardware(void) {}
-
-#endif
-
-static bool check_hw_exists(void)
-{
-       u64 val, val_fail, val_new= ~0;
-       int i, reg, reg_fail, ret = 0;
-       int bios_fail = 0;
-       int reg_safe = -1;
-
-       /*
-        * Check to see if the BIOS enabled any of the counters, if so
-        * complain and bail.
-        */
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               reg = x86_pmu_config_addr(i);
-               ret = rdmsrl_safe(reg, &val);
-               if (ret)
-                       goto msr_fail;
-               if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
-                       bios_fail = 1;
-                       val_fail = val;
-                       reg_fail = reg;
-               } else {
-                       reg_safe = i;
-               }
-       }
-
-       if (x86_pmu.num_counters_fixed) {
-               reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
-               ret = rdmsrl_safe(reg, &val);
-               if (ret)
-                       goto msr_fail;
-               for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
-                       if (val & (0x03 << i*4)) {
-                               bios_fail = 1;
-                               val_fail = val;
-                               reg_fail = reg;
-                       }
-               }
-       }
-
-       /*
-        * If all the counters are enabled, the below test will always
-        * fail.  The tools will also become useless in this scenario.
-        * Just fail and disable the hardware counters.
-        */
-
-       if (reg_safe == -1) {
-               reg = reg_safe;
-               goto msr_fail;
-       }
-
-       /*
-        * Read the current value, change it and read it back to see if it
-        * matches, this is needed to detect certain hardware emulators
-        * (qemu/kvm) that don't trap on the MSR access and always return 0s.
-        */
-       reg = x86_pmu_event_addr(reg_safe);
-       if (rdmsrl_safe(reg, &val))
-               goto msr_fail;
-       val ^= 0xffffUL;
-       ret = wrmsrl_safe(reg, val);
-       ret |= rdmsrl_safe(reg, &val_new);
-       if (ret || val != val_new)
-               goto msr_fail;
-
-       /*
-        * We still allow the PMU driver to operate:
-        */
-       if (bios_fail) {
-               printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
-               printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail);
-       }
-
-       return true;
-
-msr_fail:
-       printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
-       printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n",
-               boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR,
-               reg, val_new);
-
-       return false;
-}
-
-static void hw_perf_event_destroy(struct perf_event *event)
-{
-       x86_release_hardware();
-       atomic_dec(&active_events);
-}
-
-void hw_perf_lbr_event_destroy(struct perf_event *event)
-{
-       hw_perf_event_destroy(event);
-
-       /* undo the lbr/bts event accounting */
-       x86_del_exclusive(x86_lbr_exclusive_lbr);
-}
-
-static inline int x86_pmu_initialized(void)
-{
-       return x86_pmu.handle_irq != NULL;
-}
-
-static inline int
-set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
-{
-       struct perf_event_attr *attr = &event->attr;
-       unsigned int cache_type, cache_op, cache_result;
-       u64 config, val;
-
-       config = attr->config;
-
-       cache_type = (config >>  0) & 0xff;
-       if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
-               return -EINVAL;
-
-       cache_op = (config >>  8) & 0xff;
-       if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
-               return -EINVAL;
-
-       cache_result = (config >> 16) & 0xff;
-       if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
-               return -EINVAL;
-
-       val = hw_cache_event_ids[cache_type][cache_op][cache_result];
-
-       if (val == 0)
-               return -ENOENT;
-
-       if (val == -1)
-               return -EINVAL;
-
-       hwc->config |= val;
-       attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
-       return x86_pmu_extra_regs(val, event);
-}
-
-int x86_reserve_hardware(void)
-{
-       int err = 0;
-
-       if (!atomic_inc_not_zero(&pmc_refcount)) {
-               mutex_lock(&pmc_reserve_mutex);
-               if (atomic_read(&pmc_refcount) == 0) {
-                       if (!reserve_pmc_hardware())
-                               err = -EBUSY;
-                       else
-                               reserve_ds_buffers();
-               }
-               if (!err)
-                       atomic_inc(&pmc_refcount);
-               mutex_unlock(&pmc_reserve_mutex);
-       }
-
-       return err;
-}
-
-void x86_release_hardware(void)
-{
-       if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
-               release_pmc_hardware();
-               release_ds_buffers();
-               mutex_unlock(&pmc_reserve_mutex);
-       }
-}
-
-/*
- * Check if we can create event of a certain type (that no conflicting events
- * are present).
- */
-int x86_add_exclusive(unsigned int what)
-{
-       int i;
-
-       if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
-               mutex_lock(&pmc_reserve_mutex);
-               for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
-                       if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
-                               goto fail_unlock;
-               }
-               atomic_inc(&x86_pmu.lbr_exclusive[what]);
-               mutex_unlock(&pmc_reserve_mutex);
-       }
-
-       atomic_inc(&active_events);
-       return 0;
-
-fail_unlock:
-       mutex_unlock(&pmc_reserve_mutex);
-       return -EBUSY;
-}
-
-void x86_del_exclusive(unsigned int what)
-{
-       atomic_dec(&x86_pmu.lbr_exclusive[what]);
-       atomic_dec(&active_events);
-}
-
-int x86_setup_perfctr(struct perf_event *event)
-{
-       struct perf_event_attr *attr = &event->attr;
-       struct hw_perf_event *hwc = &event->hw;
-       u64 config;
-
-       if (!is_sampling_event(event)) {
-               hwc->sample_period = x86_pmu.max_period;
-               hwc->last_period = hwc->sample_period;
-               local64_set(&hwc->period_left, hwc->sample_period);
-       }
-
-       if (attr->type == PERF_TYPE_RAW)
-               return x86_pmu_extra_regs(event->attr.config, event);
-
-       if (attr->type == PERF_TYPE_HW_CACHE)
-               return set_ext_hw_attr(hwc, event);
-
-       if (attr->config >= x86_pmu.max_events)
-               return -EINVAL;
-
-       /*
-        * The generic map:
-        */
-       config = x86_pmu.event_map(attr->config);
-
-       if (config == 0)
-               return -ENOENT;
-
-       if (config == -1LL)
-               return -EINVAL;
-
-       /*
-        * Branch tracing:
-        */
-       if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
-           !attr->freq && hwc->sample_period == 1) {
-               /* BTS is not supported by this architecture. */
-               if (!x86_pmu.bts_active)
-                       return -EOPNOTSUPP;
-
-               /* BTS is currently only allowed for user-mode. */
-               if (!attr->exclude_kernel)
-                       return -EOPNOTSUPP;
-
-               /* disallow bts if conflicting events are present */
-               if (x86_add_exclusive(x86_lbr_exclusive_lbr))
-                       return -EBUSY;
-
-               event->destroy = hw_perf_lbr_event_destroy;
-       }
-
-       hwc->config |= config;
-
-       return 0;
-}
-
-/*
- * check that branch_sample_type is compatible with
- * settings needed for precise_ip > 1 which implies
- * using the LBR to capture ALL taken branches at the
- * priv levels of the measurement
- */
-static inline int precise_br_compat(struct perf_event *event)
-{
-       u64 m = event->attr.branch_sample_type;
-       u64 b = 0;
-
-       /* must capture all branches */
-       if (!(m & PERF_SAMPLE_BRANCH_ANY))
-               return 0;
-
-       m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
-
-       if (!event->attr.exclude_user)
-               b |= PERF_SAMPLE_BRANCH_USER;
-
-       if (!event->attr.exclude_kernel)
-               b |= PERF_SAMPLE_BRANCH_KERNEL;
-
-       /*
-        * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
-        */
-
-       return m == b;
-}
-
-int x86_pmu_hw_config(struct perf_event *event)
-{
-       if (event->attr.precise_ip) {
-               int precise = 0;
-
-               /* Support for constant skid */
-               if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
-                       precise++;
-
-                       /* Support for IP fixup */
-                       if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
-                               precise++;
-
-                       if (x86_pmu.pebs_prec_dist)
-                               precise++;
-               }
-
-               if (event->attr.precise_ip > precise)
-                       return -EOPNOTSUPP;
-       }
-       /*
-        * check that PEBS LBR correction does not conflict with
-        * whatever the user is asking with attr->branch_sample_type
-        */
-       if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
-               u64 *br_type = &event->attr.branch_sample_type;
-
-               if (has_branch_stack(event)) {
-                       if (!precise_br_compat(event))
-                               return -EOPNOTSUPP;
-
-                       /* branch_sample_type is compatible */
-
-               } else {
-                       /*
-                        * user did not specify  branch_sample_type
-                        *
-                        * For PEBS fixups, we capture all
-                        * the branches at the priv level of the
-                        * event.
-                        */
-                       *br_type = PERF_SAMPLE_BRANCH_ANY;
-
-                       if (!event->attr.exclude_user)
-                               *br_type |= PERF_SAMPLE_BRANCH_USER;
-
-                       if (!event->attr.exclude_kernel)
-                               *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
-               }
-       }
-
-       if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
-               event->attach_state |= PERF_ATTACH_TASK_DATA;
-
-       /*
-        * Generate PMC IRQs:
-        * (keep 'enabled' bit clear for now)
-        */
-       event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
-
-       /*
-        * Count user and OS events unless requested not to
-        */
-       if (!event->attr.exclude_user)
-               event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
-       if (!event->attr.exclude_kernel)
-               event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
-
-       if (event->attr.type == PERF_TYPE_RAW)
-               event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
-
-       if (event->attr.sample_period && x86_pmu.limit_period) {
-               if (x86_pmu.limit_period(event, event->attr.sample_period) >
-                               event->attr.sample_period)
-                       return -EINVAL;
-       }
-
-       return x86_setup_perfctr(event);
-}
-
-/*
- * Setup the hardware configuration for a given attr_type
- */
-static int __x86_pmu_event_init(struct perf_event *event)
-{
-       int err;
-
-       if (!x86_pmu_initialized())
-               return -ENODEV;
-
-       err = x86_reserve_hardware();
-       if (err)
-               return err;
-
-       atomic_inc(&active_events);
-       event->destroy = hw_perf_event_destroy;
-
-       event->hw.idx = -1;
-       event->hw.last_cpu = -1;
-       event->hw.last_tag = ~0ULL;
-
-       /* mark unused */
-       event->hw.extra_reg.idx = EXTRA_REG_NONE;
-       event->hw.branch_reg.idx = EXTRA_REG_NONE;
-
-       return x86_pmu.hw_config(event);
-}
-
-void x86_pmu_disable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               u64 val;
-
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-               rdmsrl(x86_pmu_config_addr(idx), val);
-               if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
-                       continue;
-               val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-               wrmsrl(x86_pmu_config_addr(idx), val);
-       }
-}
-
-static void x86_pmu_disable(struct pmu *pmu)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (!x86_pmu_initialized())
-               return;
-
-       if (!cpuc->enabled)
-               return;
-
-       cpuc->n_added = 0;
-       cpuc->enabled = 0;
-       barrier();
-
-       x86_pmu.disable_all();
-}
-
-void x86_pmu_enable_all(int added)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
-
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-
-               __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
-       }
-}
-
-static struct pmu pmu;
-
-static inline int is_x86_event(struct perf_event *event)
-{
-       return event->pmu == &pmu;
-}
-
-/*
- * Event scheduler state:
- *
- * Assign events iterating over all events and counters, beginning
- * with events with least weights first. Keep the current iterator
- * state in struct sched_state.
- */
-struct sched_state {
-       int     weight;
-       int     event;          /* event index */
-       int     counter;        /* counter index */
-       int     unassigned;     /* number of events to be assigned left */
-       int     nr_gp;          /* number of GP counters used */
-       unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-};
-
-/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
-#define        SCHED_STATES_MAX        2
-
-struct perf_sched {
-       int                     max_weight;
-       int                     max_events;
-       int                     max_gp;
-       int                     saved_states;
-       struct event_constraint **constraints;
-       struct sched_state      state;
-       struct sched_state      saved[SCHED_STATES_MAX];
-};
-
-/*
- * Initialize interator that runs through all events and counters.
- */
-static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
-                           int num, int wmin, int wmax, int gpmax)
-{
-       int idx;
-
-       memset(sched, 0, sizeof(*sched));
-       sched->max_events       = num;
-       sched->max_weight       = wmax;
-       sched->max_gp           = gpmax;
-       sched->constraints      = constraints;
-
-       for (idx = 0; idx < num; idx++) {
-               if (constraints[idx]->weight == wmin)
-                       break;
-       }
-
-       sched->state.event      = idx;          /* start with min weight */
-       sched->state.weight     = wmin;
-       sched->state.unassigned = num;
-}
-
-static void perf_sched_save_state(struct perf_sched *sched)
-{
-       if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
-               return;
-
-       sched->saved[sched->saved_states] = sched->state;
-       sched->saved_states++;
-}
-
-static bool perf_sched_restore_state(struct perf_sched *sched)
-{
-       if (!sched->saved_states)
-               return false;
-
-       sched->saved_states--;
-       sched->state = sched->saved[sched->saved_states];
-
-       /* continue with next counter: */
-       clear_bit(sched->state.counter++, sched->state.used);
-
-       return true;
-}
-
-/*
- * Select a counter for the current event to schedule. Return true on
- * success.
- */
-static bool __perf_sched_find_counter(struct perf_sched *sched)
-{
-       struct event_constraint *c;
-       int idx;
-
-       if (!sched->state.unassigned)
-               return false;
-
-       if (sched->state.event >= sched->max_events)
-               return false;
-
-       c = sched->constraints[sched->state.event];
-       /* Prefer fixed purpose counters */
-       if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
-               idx = INTEL_PMC_IDX_FIXED;
-               for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
-                       if (!__test_and_set_bit(idx, sched->state.used))
-                               goto done;
-               }
-       }
-
-       /* Grab the first unused counter starting with idx */
-       idx = sched->state.counter;
-       for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
-               if (!__test_and_set_bit(idx, sched->state.used)) {
-                       if (sched->state.nr_gp++ >= sched->max_gp)
-                               return false;
-
-                       goto done;
-               }
-       }
-
-       return false;
-
-done:
-       sched->state.counter = idx;
-
-       if (c->overlap)
-               perf_sched_save_state(sched);
-
-       return true;
-}
-
-static bool perf_sched_find_counter(struct perf_sched *sched)
-{
-       while (!__perf_sched_find_counter(sched)) {
-               if (!perf_sched_restore_state(sched))
-                       return false;
-       }
-
-       return true;
-}
-
-/*
- * Go through all unassigned events and find the next one to schedule.
- * Take events with the least weight first. Return true on success.
- */
-static bool perf_sched_next_event(struct perf_sched *sched)
-{
-       struct event_constraint *c;
-
-       if (!sched->state.unassigned || !--sched->state.unassigned)
-               return false;
-
-       do {
-               /* next event */
-               sched->state.event++;
-               if (sched->state.event >= sched->max_events) {
-                       /* next weight */
-                       sched->state.event = 0;
-                       sched->state.weight++;
-                       if (sched->state.weight > sched->max_weight)
-                               return false;
-               }
-               c = sched->constraints[sched->state.event];
-       } while (c->weight != sched->state.weight);
-
-       sched->state.counter = 0;       /* start with first counter */
-
-       return true;
-}
-
-/*
- * Assign a counter for each event.
- */
-int perf_assign_events(struct event_constraint **constraints, int n,
-                       int wmin, int wmax, int gpmax, int *assign)
-{
-       struct perf_sched sched;
-
-       perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
-
-       do {
-               if (!perf_sched_find_counter(&sched))
-                       break;  /* failed */
-               if (assign)
-                       assign[sched.state.event] = sched.state.counter;
-       } while (perf_sched_next_event(&sched));
-
-       return sched.state.unassigned;
-}
-EXPORT_SYMBOL_GPL(perf_assign_events);
-
-int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
-{
-       struct event_constraint *c;
-       unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-       struct perf_event *e;
-       int i, wmin, wmax, unsched = 0;
-       struct hw_perf_event *hwc;
-
-       bitmap_zero(used_mask, X86_PMC_IDX_MAX);
-
-       if (x86_pmu.start_scheduling)
-               x86_pmu.start_scheduling(cpuc);
-
-       for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
-               cpuc->event_constraint[i] = NULL;
-               c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
-               cpuc->event_constraint[i] = c;
-
-               wmin = min(wmin, c->weight);
-               wmax = max(wmax, c->weight);
-       }
-
-       /*
-        * fastpath, try to reuse previous register
-        */
-       for (i = 0; i < n; i++) {
-               hwc = &cpuc->event_list[i]->hw;
-               c = cpuc->event_constraint[i];
-
-               /* never assigned */
-               if (hwc->idx == -1)
-                       break;
-
-               /* constraint still honored */
-               if (!test_bit(hwc->idx, c->idxmsk))
-                       break;
-
-               /* not already used */
-               if (test_bit(hwc->idx, used_mask))
-                       break;
-
-               __set_bit(hwc->idx, used_mask);
-               if (assign)
-                       assign[i] = hwc->idx;
-       }
-
-       /* slow path */
-       if (i != n) {
-               int gpmax = x86_pmu.num_counters;
-
-               /*
-                * Do not allow scheduling of more than half the available
-                * generic counters.
-                *
-                * This helps avoid counter starvation of sibling thread by
-                * ensuring at most half the counters cannot be in exclusive
-                * mode. There is no designated counters for the limits. Any
-                * N/2 counters can be used. This helps with events with
-                * specific counter constraints.
-                */
-               if (is_ht_workaround_enabled() && !cpuc->is_fake &&
-                   READ_ONCE(cpuc->excl_cntrs->exclusive_present))
-                       gpmax /= 2;
-
-               unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
-                                            wmax, gpmax, assign);
-       }
-
-       /*
-        * In case of success (unsched = 0), mark events as committed,
-        * so we do not put_constraint() in case new events are added
-        * and fail to be scheduled
-        *
-        * We invoke the lower level commit callback to lock the resource
-        *
-        * We do not need to do all of this in case we are called to
-        * validate an event group (assign == NULL)
-        */
-       if (!unsched && assign) {
-               for (i = 0; i < n; i++) {
-                       e = cpuc->event_list[i];
-                       e->hw.flags |= PERF_X86_EVENT_COMMITTED;
-                       if (x86_pmu.commit_scheduling)
-                               x86_pmu.commit_scheduling(cpuc, i, assign[i]);
-               }
-       } else {
-               for (i = 0; i < n; i++) {
-                       e = cpuc->event_list[i];
-                       /*
-                        * do not put_constraint() on comitted events,
-                        * because they are good to go
-                        */
-                       if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
-                               continue;
-
-                       /*
-                        * release events that failed scheduling
-                        */
-                       if (x86_pmu.put_event_constraints)
-                               x86_pmu.put_event_constraints(cpuc, e);
-               }
-       }
-
-       if (x86_pmu.stop_scheduling)
-               x86_pmu.stop_scheduling(cpuc);
-
-       return unsched ? -EINVAL : 0;
-}
-
-/*
- * dogrp: true if must collect siblings events (group)
- * returns total number of events and error code
- */
-static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
-{
-       struct perf_event *event;
-       int n, max_count;
-
-       max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
-
-       /* current number of events already accepted */
-       n = cpuc->n_events;
-
-       if (is_x86_event(leader)) {
-               if (n >= max_count)
-                       return -EINVAL;
-               cpuc->event_list[n] = leader;
-               n++;
-       }
-       if (!dogrp)
-               return n;
-
-       list_for_each_entry(event, &leader->sibling_list, group_entry) {
-               if (!is_x86_event(event) ||
-                   event->state <= PERF_EVENT_STATE_OFF)
-                       continue;
-
-               if (n >= max_count)
-                       return -EINVAL;
-
-               cpuc->event_list[n] = event;
-               n++;
-       }
-       return n;
-}
-
-static inline void x86_assign_hw_event(struct perf_event *event,
-                               struct cpu_hw_events *cpuc, int i)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       hwc->idx = cpuc->assign[i];
-       hwc->last_cpu = smp_processor_id();
-       hwc->last_tag = ++cpuc->tags[i];
-
-       if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
-               hwc->config_base = 0;
-               hwc->event_base = 0;
-       } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
-               hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
-               hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
-               hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
-       } else {
-               hwc->config_base = x86_pmu_config_addr(hwc->idx);
-               hwc->event_base  = x86_pmu_event_addr(hwc->idx);
-               hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
-       }
-}
-
-static inline int match_prev_assignment(struct hw_perf_event *hwc,
-                                       struct cpu_hw_events *cpuc,
-                                       int i)
-{
-       return hwc->idx == cpuc->assign[i] &&
-               hwc->last_cpu == smp_processor_id() &&
-               hwc->last_tag == cpuc->tags[i];
-}
-
-static void x86_pmu_start(struct perf_event *event, int flags);
-
-static void x86_pmu_enable(struct pmu *pmu)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct perf_event *event;
-       struct hw_perf_event *hwc;
-       int i, added = cpuc->n_added;
-
-       if (!x86_pmu_initialized())
-               return;
-
-       if (cpuc->enabled)
-               return;
-
-       if (cpuc->n_added) {
-               int n_running = cpuc->n_events - cpuc->n_added;
-               /*
-                * apply assignment obtained either from
-                * hw_perf_group_sched_in() or x86_pmu_enable()
-                *
-                * step1: save events moving to new counters
-                */
-               for (i = 0; i < n_running; i++) {
-                       event = cpuc->event_list[i];
-                       hwc = &event->hw;
-
-                       /*
-                        * we can avoid reprogramming counter if:
-                        * - assigned same counter as last time
-                        * - running on same CPU as last time
-                        * - no other event has used the counter since
-                        */
-                       if (hwc->idx == -1 ||
-                           match_prev_assignment(hwc, cpuc, i))
-                               continue;
-
-                       /*
-                        * Ensure we don't accidentally enable a stopped
-                        * counter simply because we rescheduled.
-                        */
-                       if (hwc->state & PERF_HES_STOPPED)
-                               hwc->state |= PERF_HES_ARCH;
-
-                       x86_pmu_stop(event, PERF_EF_UPDATE);
-               }
-
-               /*
-                * step2: reprogram moved events into new counters
-                */
-               for (i = 0; i < cpuc->n_events; i++) {
-                       event = cpuc->event_list[i];
-                       hwc = &event->hw;
-
-                       if (!match_prev_assignment(hwc, cpuc, i))
-                               x86_assign_hw_event(event, cpuc, i);
-                       else if (i < n_running)
-                               continue;
-
-                       if (hwc->state & PERF_HES_ARCH)
-                               continue;
-
-                       x86_pmu_start(event, PERF_EF_RELOAD);
-               }
-               cpuc->n_added = 0;
-               perf_events_lapic_init();
-       }
-
-       cpuc->enabled = 1;
-       barrier();
-
-       x86_pmu.enable_all(added);
-}
-
-static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
-
-/*
- * Set the next IRQ period, based on the hwc->period_left value.
- * To be called with the event disabled in hw:
- */
-int x86_perf_event_set_period(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       s64 left = local64_read(&hwc->period_left);
-       s64 period = hwc->sample_period;
-       int ret = 0, idx = hwc->idx;
-
-       if (idx == INTEL_PMC_IDX_FIXED_BTS)
-               return 0;
-
-       /*
-        * If we are way outside a reasonable range then just skip forward:
-        */
-       if (unlikely(left <= -period)) {
-               left = period;
-               local64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-               ret = 1;
-       }
-
-       if (unlikely(left <= 0)) {
-               left += period;
-               local64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-               ret = 1;
-       }
-       /*
-        * Quirk: certain CPUs dont like it if just 1 hw_event is left:
-        */
-       if (unlikely(left < 2))
-               left = 2;
-
-       if (left > x86_pmu.max_period)
-               left = x86_pmu.max_period;
-
-       if (x86_pmu.limit_period)
-               left = x86_pmu.limit_period(event, left);
-
-       per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
-
-       if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) ||
-           local64_read(&hwc->prev_count) != (u64)-left) {
-               /*
-                * The hw event starts counting from this event offset,
-                * mark it to be able to extra future deltas:
-                */
-               local64_set(&hwc->prev_count, (u64)-left);
-
-               wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
-       }
-
-       /*
-        * Due to erratum on certan cpu we need
-        * a second write to be sure the register
-        * is updated properly
-        */
-       if (x86_pmu.perfctr_second_write) {
-               wrmsrl(hwc->event_base,
-                       (u64)(-left) & x86_pmu.cntval_mask);
-       }
-
-       perf_event_update_userpage(event);
-
-       return ret;
-}
-
-void x86_pmu_enable_event(struct perf_event *event)
-{
-       if (__this_cpu_read(cpu_hw_events.enabled))
-               __x86_pmu_enable_event(&event->hw,
-                                      ARCH_PERFMON_EVENTSEL_ENABLE);
-}
-
-/*
- * Add a single event to the PMU.
- *
- * The event is added to the group of enabled events
- * but only if it can be scehduled with existing events.
- */
-static int x86_pmu_add(struct perf_event *event, int flags)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct hw_perf_event *hwc;
-       int assign[X86_PMC_IDX_MAX];
-       int n, n0, ret;
-
-       hwc = &event->hw;
-
-       n0 = cpuc->n_events;
-       ret = n = collect_events(cpuc, event, false);
-       if (ret < 0)
-               goto out;
-
-       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-       if (!(flags & PERF_EF_START))
-               hwc->state |= PERF_HES_ARCH;
-
-       /*
-        * If group events scheduling transaction was started,
-        * skip the schedulability test here, it will be performed
-        * at commit time (->commit_txn) as a whole.
-        */
-       if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
-               goto done_collect;
-
-       ret = x86_pmu.schedule_events(cpuc, n, assign);
-       if (ret)
-               goto out;
-       /*
-        * copy new assignment, now we know it is possible
-        * will be used by hw_perf_enable()
-        */
-       memcpy(cpuc->assign, assign, n*sizeof(int));
-
-done_collect:
-       /*
-        * Commit the collect_events() state. See x86_pmu_del() and
-        * x86_pmu_*_txn().
-        */
-       cpuc->n_events = n;
-       cpuc->n_added += n - n0;
-       cpuc->n_txn += n - n0;
-
-       ret = 0;
-out:
-       return ret;
-}
-
-static void x86_pmu_start(struct perf_event *event, int flags)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx = event->hw.idx;
-
-       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
-               return;
-
-       if (WARN_ON_ONCE(idx == -1))
-               return;
-
-       if (flags & PERF_EF_RELOAD) {
-               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
-               x86_perf_event_set_period(event);
-       }
-
-       event->hw.state = 0;
-
-       cpuc->events[idx] = event;
-       __set_bit(idx, cpuc->active_mask);
-       __set_bit(idx, cpuc->running);
-       x86_pmu.enable(event);
-       perf_event_update_userpage(event);
-}
-
-void perf_event_print_debug(void)
-{
-       u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
-       u64 pebs, debugctl;
-       struct cpu_hw_events *cpuc;
-       unsigned long flags;
-       int cpu, idx;
-
-       if (!x86_pmu.num_counters)
-               return;
-
-       local_irq_save(flags);
-
-       cpu = smp_processor_id();
-       cpuc = &per_cpu(cpu_hw_events, cpu);
-
-       if (x86_pmu.version >= 2) {
-               rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
-               rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-               rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
-               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
-
-               pr_info("\n");
-               pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
-               pr_info("CPU#%d: status:     %016llx\n", cpu, status);
-               pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
-               pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
-               if (x86_pmu.pebs_constraints) {
-                       rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
-                       pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
-               }
-               if (x86_pmu.lbr_nr) {
-                       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-                       pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
-               }
-       }
-       pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
-               rdmsrl(x86_pmu_event_addr(idx), pmc_count);
-
-               prev_left = per_cpu(pmc_prev_left[idx], cpu);
-
-               pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
-                       cpu, idx, pmc_ctrl);
-               pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
-                       cpu, idx, pmc_count);
-               pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
-                       cpu, idx, prev_left);
-       }
-       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
-               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
-
-               pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
-                       cpu, idx, pmc_count);
-       }
-       local_irq_restore(flags);
-}
-
-void x86_pmu_stop(struct perf_event *event, int flags)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
-               x86_pmu.disable(event);
-               cpuc->events[hwc->idx] = NULL;
-               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-               hwc->state |= PERF_HES_STOPPED;
-       }
-
-       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-               /*
-                * Drain the remaining delta count out of a event
-                * that we are disabling:
-                */
-               x86_perf_event_update(event);
-               hwc->state |= PERF_HES_UPTODATE;
-       }
-}
-
-static void x86_pmu_del(struct perf_event *event, int flags)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int i;
-
-       /*
-        * event is descheduled
-        */
-       event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
-
-       /*
-        * If we're called during a txn, we don't need to do anything.
-        * The events never got scheduled and ->cancel_txn will truncate
-        * the event_list.
-        *
-        * XXX assumes any ->del() called during a TXN will only be on
-        * an event added during that same TXN.
-        */
-       if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
-               return;
-
-       /*
-        * Not a TXN, therefore cleanup properly.
-        */
-       x86_pmu_stop(event, PERF_EF_UPDATE);
-
-       for (i = 0; i < cpuc->n_events; i++) {
-               if (event == cpuc->event_list[i])
-                       break;
-       }
-
-       if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
-               return;
-
-       /* If we have a newly added event; make sure to decrease n_added. */
-       if (i >= cpuc->n_events - cpuc->n_added)
-               --cpuc->n_added;
-
-       if (x86_pmu.put_event_constraints)
-               x86_pmu.put_event_constraints(cpuc, event);
-
-       /* Delete the array entry. */
-       while (++i < cpuc->n_events) {
-               cpuc->event_list[i-1] = cpuc->event_list[i];
-               cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
-       }
-       --cpuc->n_events;
-
-       perf_event_update_userpage(event);
-}
-
-int x86_pmu_handle_irq(struct pt_regs *regs)
-{
-       struct perf_sample_data data;
-       struct cpu_hw_events *cpuc;
-       struct perf_event *event;
-       int idx, handled = 0;
-       u64 val;
-
-       cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       /*
-        * Some chipsets need to unmask the LVTPC in a particular spot
-        * inside the nmi handler.  As a result, the unmasking was pushed
-        * into all the nmi handlers.
-        *
-        * This generic handler doesn't seem to have any issues where the
-        * unmasking occurs so it was left at the top.
-        */
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               if (!test_bit(idx, cpuc->active_mask)) {
-                       /*
-                        * Though we deactivated the counter some cpus
-                        * might still deliver spurious interrupts still
-                        * in flight. Catch them:
-                        */
-                       if (__test_and_clear_bit(idx, cpuc->running))
-                               handled++;
-                       continue;
-               }
-
-               event = cpuc->events[idx];
-
-               val = x86_perf_event_update(event);
-               if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
-                       continue;
-
-               /*
-                * event overflow
-                */
-               handled++;
-               perf_sample_data_init(&data, 0, event->hw.last_period);
-
-               if (!x86_perf_event_set_period(event))
-                       continue;
-
-               if (perf_event_overflow(event, &data, regs))
-                       x86_pmu_stop(event, 0);
-       }
-
-       if (handled)
-               inc_irq_stat(apic_perf_irqs);
-
-       return handled;
-}
-
-void perf_events_lapic_init(void)
-{
-       if (!x86_pmu.apic || !x86_pmu_initialized())
-               return;
-
-       /*
-        * Always use NMI for PMU
-        */
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-}
-
-static int
-perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
-{
-       u64 start_clock;
-       u64 finish_clock;
-       int ret;
-
-       /*
-        * All PMUs/events that share this PMI handler should make sure to
-        * increment active_events for their events.
-        */
-       if (!atomic_read(&active_events))
-               return NMI_DONE;
-
-       start_clock = sched_clock();
-       ret = x86_pmu.handle_irq(regs);
-       finish_clock = sched_clock();
-
-       perf_sample_event_took(finish_clock - start_clock);
-
-       return ret;
-}
-NOKPROBE_SYMBOL(perf_event_nmi_handler);
-
-struct event_constraint emptyconstraint;
-struct event_constraint unconstrained;
-
-static int
-x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-       int i, ret = NOTIFY_OK;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-               for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
-                       cpuc->kfree_on_online[i] = NULL;
-               if (x86_pmu.cpu_prepare)
-                       ret = x86_pmu.cpu_prepare(cpu);
-               break;
-
-       case CPU_STARTING:
-               if (x86_pmu.cpu_starting)
-                       x86_pmu.cpu_starting(cpu);
-               break;
-
-       case CPU_ONLINE:
-               for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
-                       kfree(cpuc->kfree_on_online[i]);
-                       cpuc->kfree_on_online[i] = NULL;
-               }
-               break;
-
-       case CPU_DYING:
-               if (x86_pmu.cpu_dying)
-                       x86_pmu.cpu_dying(cpu);
-               break;
-
-       case CPU_UP_CANCELED:
-       case CPU_DEAD:
-               if (x86_pmu.cpu_dead)
-                       x86_pmu.cpu_dead(cpu);
-               break;
-
-       default:
-               break;
-       }
-
-       return ret;
-}
-
-static void __init pmu_check_apic(void)
-{
-       if (cpu_has_apic)
-               return;
-
-       x86_pmu.apic = 0;
-       pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
-       pr_info("no hardware sampling interrupt available.\n");
-
-       /*
-        * If we have a PMU initialized but no APIC
-        * interrupts, we cannot sample hardware
-        * events (user-space has to fall back and
-        * sample via a hrtimer based software event):
-        */
-       pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
-
-}
-
-static struct attribute_group x86_pmu_format_group = {
-       .name = "format",
-       .attrs = NULL,
-};
-
-/*
- * Remove all undefined events (x86_pmu.event_map(id) == 0)
- * out of events_attr attributes.
- */
-static void __init filter_events(struct attribute **attrs)
-{
-       struct device_attribute *d;
-       struct perf_pmu_events_attr *pmu_attr;
-       int offset = 0;
-       int i, j;
-
-       for (i = 0; attrs[i]; i++) {
-               d = (struct device_attribute *)attrs[i];
-               pmu_attr = container_of(d, struct perf_pmu_events_attr, attr);
-               /* str trumps id */
-               if (pmu_attr->event_str)
-                       continue;
-               if (x86_pmu.event_map(i + offset))
-                       continue;
-
-               for (j = i; attrs[j]; j++)
-                       attrs[j] = attrs[j + 1];
-
-               /* Check the shifted attr. */
-               i--;
-
-               /*
-                * event_map() is index based, the attrs array is organized
-                * by increasing event index. If we shift the events, then
-                * we need to compensate for the event_map(), otherwise
-                * we are looking up the wrong event in the map
-                */
-               offset++;
-       }
-}
-
-/* Merge two pointer arrays */
-__init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
-{
-       struct attribute **new;
-       int j, i;
-
-       for (j = 0; a[j]; j++)
-               ;
-       for (i = 0; b[i]; i++)
-               j++;
-       j++;
-
-       new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL);
-       if (!new)
-               return NULL;
-
-       j = 0;
-       for (i = 0; a[i]; i++)
-               new[j++] = a[i];
-       for (i = 0; b[i]; i++)
-               new[j++] = b[i];
-       new[j] = NULL;
-
-       return new;
-}
-
-ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
-                         char *page)
-{
-       struct perf_pmu_events_attr *pmu_attr = \
-               container_of(attr, struct perf_pmu_events_attr, attr);
-       u64 config = x86_pmu.event_map(pmu_attr->id);
-
-       /* string trumps id */
-       if (pmu_attr->event_str)
-               return sprintf(page, "%s", pmu_attr->event_str);
-
-       return x86_pmu.events_sysfs_show(page, config);
-}
-
-EVENT_ATTR(cpu-cycles,                 CPU_CYCLES              );
-EVENT_ATTR(instructions,               INSTRUCTIONS            );
-EVENT_ATTR(cache-references,           CACHE_REFERENCES        );
-EVENT_ATTR(cache-misses,               CACHE_MISSES            );
-EVENT_ATTR(branch-instructions,                BRANCH_INSTRUCTIONS     );
-EVENT_ATTR(branch-misses,              BRANCH_MISSES           );
-EVENT_ATTR(bus-cycles,                 BUS_CYCLES              );
-EVENT_ATTR(stalled-cycles-frontend,    STALLED_CYCLES_FRONTEND );
-EVENT_ATTR(stalled-cycles-backend,     STALLED_CYCLES_BACKEND  );
-EVENT_ATTR(ref-cycles,                 REF_CPU_CYCLES          );
-
-static struct attribute *empty_attrs;
-
-static struct attribute *events_attr[] = {
-       EVENT_PTR(CPU_CYCLES),
-       EVENT_PTR(INSTRUCTIONS),
-       EVENT_PTR(CACHE_REFERENCES),
-       EVENT_PTR(CACHE_MISSES),
-       EVENT_PTR(BRANCH_INSTRUCTIONS),
-       EVENT_PTR(BRANCH_MISSES),
-       EVENT_PTR(BUS_CYCLES),
-       EVENT_PTR(STALLED_CYCLES_FRONTEND),
-       EVENT_PTR(STALLED_CYCLES_BACKEND),
-       EVENT_PTR(REF_CPU_CYCLES),
-       NULL,
-};
-
-static struct attribute_group x86_pmu_events_group = {
-       .name = "events",
-       .attrs = events_attr,
-};
-
-ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
-{
-       u64 umask  = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
-       u64 cmask  = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
-       bool edge  = (config & ARCH_PERFMON_EVENTSEL_EDGE);
-       bool pc    = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
-       bool any   = (config & ARCH_PERFMON_EVENTSEL_ANY);
-       bool inv   = (config & ARCH_PERFMON_EVENTSEL_INV);
-       ssize_t ret;
-
-       /*
-       * We have whole page size to spend and just little data
-       * to write, so we can safely use sprintf.
-       */
-       ret = sprintf(page, "event=0x%02llx", event);
-
-       if (umask)
-               ret += sprintf(page + ret, ",umask=0x%02llx", umask);
-
-       if (edge)
-               ret += sprintf(page + ret, ",edge");
-
-       if (pc)
-               ret += sprintf(page + ret, ",pc");
-
-       if (any)
-               ret += sprintf(page + ret, ",any");
-
-       if (inv)
-               ret += sprintf(page + ret, ",inv");
-
-       if (cmask)
-               ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
-
-       ret += sprintf(page + ret, "\n");
-
-       return ret;
-}
-
-static int __init init_hw_perf_events(void)
-{
-       struct x86_pmu_quirk *quirk;
-       int err;
-
-       pr_info("Performance Events: ");
-
-       switch (boot_cpu_data.x86_vendor) {
-       case X86_VENDOR_INTEL:
-               err = intel_pmu_init();
-               break;
-       case X86_VENDOR_AMD:
-               err = amd_pmu_init();
-               break;
-       default:
-               err = -ENOTSUPP;
-       }
-       if (err != 0) {
-               pr_cont("no PMU driver, software events only.\n");
-               return 0;
-       }
-
-       pmu_check_apic();
-
-       /* sanity check that the hardware exists or is emulated */
-       if (!check_hw_exists())
-               return 0;
-
-       pr_cont("%s PMU driver.\n", x86_pmu.name);
-
-       x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
-
-       for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
-               quirk->func();
-
-       if (!x86_pmu.intel_ctrl)
-               x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
-
-       perf_events_lapic_init();
-       register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
-
-       unconstrained = (struct event_constraint)
-               __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
-                                  0, x86_pmu.num_counters, 0, 0);
-
-       x86_pmu_format_group.attrs = x86_pmu.format_attrs;
-
-       if (x86_pmu.event_attrs)
-               x86_pmu_events_group.attrs = x86_pmu.event_attrs;
-
-       if (!x86_pmu.events_sysfs_show)
-               x86_pmu_events_group.attrs = &empty_attrs;
-       else
-               filter_events(x86_pmu_events_group.attrs);
-
-       if (x86_pmu.cpu_events) {
-               struct attribute **tmp;
-
-               tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events);
-               if (!WARN_ON(!tmp))
-                       x86_pmu_events_group.attrs = tmp;
-       }
-
-       pr_info("... version:                %d\n",     x86_pmu.version);
-       pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
-       pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
-       pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
-       pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
-       pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
-       pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
-
-       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
-       perf_cpu_notifier(x86_pmu_notifier);
-
-       return 0;
-}
-early_initcall(init_hw_perf_events);
-
-static inline void x86_pmu_read(struct perf_event *event)
-{
-       x86_perf_event_update(event);
-}
-
-/*
- * Start group events scheduling transaction
- * Set the flag to make pmu::enable() not perform the
- * schedulability test, it will be performed at commit time
- *
- * We only support PERF_PMU_TXN_ADD transactions. Save the
- * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
- * transactions.
- */
-static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       WARN_ON_ONCE(cpuc->txn_flags);          /* txn already in flight */
-
-       cpuc->txn_flags = txn_flags;
-       if (txn_flags & ~PERF_PMU_TXN_ADD)
-               return;
-
-       perf_pmu_disable(pmu);
-       __this_cpu_write(cpu_hw_events.n_txn, 0);
-}
-
-/*
- * Stop group events scheduling transaction
- * Clear the flag and pmu::enable() will perform the
- * schedulability test.
- */
-static void x86_pmu_cancel_txn(struct pmu *pmu)
-{
-       unsigned int txn_flags;
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
-
-       txn_flags = cpuc->txn_flags;
-       cpuc->txn_flags = 0;
-       if (txn_flags & ~PERF_PMU_TXN_ADD)
-               return;
-
-       /*
-        * Truncate collected array by the number of events added in this
-        * transaction. See x86_pmu_add() and x86_pmu_*_txn().
-        */
-       __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
-       __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
-       perf_pmu_enable(pmu);
-}
-
-/*
- * Commit group events scheduling transaction
- * Perform the group schedulability test as a whole
- * Return 0 if success
- *
- * Does not cancel the transaction on failure; expects the caller to do this.
- */
-static int x86_pmu_commit_txn(struct pmu *pmu)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int assign[X86_PMC_IDX_MAX];
-       int n, ret;
-
-       WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
-
-       if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
-               cpuc->txn_flags = 0;
-               return 0;
-       }
-
-       n = cpuc->n_events;
-
-       if (!x86_pmu_initialized())
-               return -EAGAIN;
-
-       ret = x86_pmu.schedule_events(cpuc, n, assign);
-       if (ret)
-               return ret;
-
-       /*
-        * copy new assignment, now we know it is possible
-        * will be used by hw_perf_enable()
-        */
-       memcpy(cpuc->assign, assign, n*sizeof(int));
-
-       cpuc->txn_flags = 0;
-       perf_pmu_enable(pmu);
-       return 0;
-}
-/*
- * a fake_cpuc is used to validate event groups. Due to
- * the extra reg logic, we need to also allocate a fake
- * per_core and per_cpu structure. Otherwise, group events
- * using extra reg may conflict without the kernel being
- * able to catch this when the last event gets added to
- * the group.
- */
-static void free_fake_cpuc(struct cpu_hw_events *cpuc)
-{
-       kfree(cpuc->shared_regs);
-       kfree(cpuc);
-}
-
-static struct cpu_hw_events *allocate_fake_cpuc(void)
-{
-       struct cpu_hw_events *cpuc;
-       int cpu = raw_smp_processor_id();
-
-       cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
-       if (!cpuc)
-               return ERR_PTR(-ENOMEM);
-
-       /* only needed, if we have extra_regs */
-       if (x86_pmu.extra_regs) {
-               cpuc->shared_regs = allocate_shared_regs(cpu);
-               if (!cpuc->shared_regs)
-                       goto error;
-       }
-       cpuc->is_fake = 1;
-       return cpuc;
-error:
-       free_fake_cpuc(cpuc);
-       return ERR_PTR(-ENOMEM);
-}
-
-/*
- * validate that we can schedule this event
- */
-static int validate_event(struct perf_event *event)
-{
-       struct cpu_hw_events *fake_cpuc;
-       struct event_constraint *c;
-       int ret = 0;
-
-       fake_cpuc = allocate_fake_cpuc();
-       if (IS_ERR(fake_cpuc))
-               return PTR_ERR(fake_cpuc);
-
-       c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
-
-       if (!c || !c->weight)
-               ret = -EINVAL;
-
-       if (x86_pmu.put_event_constraints)
-               x86_pmu.put_event_constraints(fake_cpuc, event);
-
-       free_fake_cpuc(fake_cpuc);
-
-       return ret;
-}
-
-/*
- * validate a single event group
- *
- * validation include:
- *     - check events are compatible which each other
- *     - events do not compete for the same counter
- *     - number of events <= number of counters
- *
- * validation ensures the group can be loaded onto the
- * PMU if it was the only group available.
- */
-static int validate_group(struct perf_event *event)
-{
-       struct perf_event *leader = event->group_leader;
-       struct cpu_hw_events *fake_cpuc;
-       int ret = -EINVAL, n;
-
-       fake_cpuc = allocate_fake_cpuc();
-       if (IS_ERR(fake_cpuc))
-               return PTR_ERR(fake_cpuc);
-       /*
-        * the event is not yet connected with its
-        * siblings therefore we must first collect
-        * existing siblings, then add the new event
-        * before we can simulate the scheduling
-        */
-       n = collect_events(fake_cpuc, leader, true);
-       if (n < 0)
-               goto out;
-
-       fake_cpuc->n_events = n;
-       n = collect_events(fake_cpuc, event, false);
-       if (n < 0)
-               goto out;
-
-       fake_cpuc->n_events = n;
-
-       ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
-
-out:
-       free_fake_cpuc(fake_cpuc);
-       return ret;
-}
-
-static int x86_pmu_event_init(struct perf_event *event)
-{
-       struct pmu *tmp;
-       int err;
-
-       switch (event->attr.type) {
-       case PERF_TYPE_RAW:
-       case PERF_TYPE_HARDWARE:
-       case PERF_TYPE_HW_CACHE:
-               break;
-
-       default:
-               return -ENOENT;
-       }
-
-       err = __x86_pmu_event_init(event);
-       if (!err) {
-               /*
-                * we temporarily connect event to its pmu
-                * such that validate_group() can classify
-                * it as an x86 event using is_x86_event()
-                */
-               tmp = event->pmu;
-               event->pmu = &pmu;
-
-               if (event->group_leader != event)
-                       err = validate_group(event);
-               else
-                       err = validate_event(event);
-
-               event->pmu = tmp;
-       }
-       if (err) {
-               if (event->destroy)
-                       event->destroy(event);
-       }
-
-       if (ACCESS_ONCE(x86_pmu.attr_rdpmc))
-               event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
-
-       return err;
-}
-
-static void refresh_pce(void *ignored)
-{
-       if (current->mm)
-               load_mm_cr4(current->mm);
-}
-
-static void x86_pmu_event_mapped(struct perf_event *event)
-{
-       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
-               return;
-
-       if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
-               on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
-}
-
-static void x86_pmu_event_unmapped(struct perf_event *event)
-{
-       if (!current->mm)
-               return;
-
-       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
-               return;
-
-       if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
-               on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
-}
-
-static int x86_pmu_event_idx(struct perf_event *event)
-{
-       int idx = event->hw.idx;
-
-       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
-               return 0;
-
-       if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
-               idx -= INTEL_PMC_IDX_FIXED;
-               idx |= 1 << 30;
-       }
-
-       return idx + 1;
-}
-
-static ssize_t get_attr_rdpmc(struct device *cdev,
-                             struct device_attribute *attr,
-                             char *buf)
-{
-       return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
-}
-
-static ssize_t set_attr_rdpmc(struct device *cdev,
-                             struct device_attribute *attr,
-                             const char *buf, size_t count)
-{
-       unsigned long val;
-       ssize_t ret;
-
-       ret = kstrtoul(buf, 0, &val);
-       if (ret)
-               return ret;
-
-       if (val > 2)
-               return -EINVAL;
-
-       if (x86_pmu.attr_rdpmc_broken)
-               return -ENOTSUPP;
-
-       if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
-               /*
-                * Changing into or out of always available, aka
-                * perf-event-bypassing mode.  This path is extremely slow,
-                * but only root can trigger it, so it's okay.
-                */
-               if (val == 2)
-                       static_key_slow_inc(&rdpmc_always_available);
-               else
-                       static_key_slow_dec(&rdpmc_always_available);
-               on_each_cpu(refresh_pce, NULL, 1);
-       }
-
-       x86_pmu.attr_rdpmc = val;
-
-       return count;
-}
-
-static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
-
-static struct attribute *x86_pmu_attrs[] = {
-       &dev_attr_rdpmc.attr,
-       NULL,
-};
-
-static struct attribute_group x86_pmu_attr_group = {
-       .attrs = x86_pmu_attrs,
-};
-
-static const struct attribute_group *x86_pmu_attr_groups[] = {
-       &x86_pmu_attr_group,
-       &x86_pmu_format_group,
-       &x86_pmu_events_group,
-       NULL,
-};
-
-static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
-{
-       if (x86_pmu.sched_task)
-               x86_pmu.sched_task(ctx, sched_in);
-}
-
-void perf_check_microcode(void)
-{
-       if (x86_pmu.check_microcode)
-               x86_pmu.check_microcode();
-}
-EXPORT_SYMBOL_GPL(perf_check_microcode);
-
-static struct pmu pmu = {
-       .pmu_enable             = x86_pmu_enable,
-       .pmu_disable            = x86_pmu_disable,
-
-       .attr_groups            = x86_pmu_attr_groups,
-
-       .event_init             = x86_pmu_event_init,
-
-       .event_mapped           = x86_pmu_event_mapped,
-       .event_unmapped         = x86_pmu_event_unmapped,
-
-       .add                    = x86_pmu_add,
-       .del                    = x86_pmu_del,
-       .start                  = x86_pmu_start,
-       .stop                   = x86_pmu_stop,
-       .read                   = x86_pmu_read,
-
-       .start_txn              = x86_pmu_start_txn,
-       .cancel_txn             = x86_pmu_cancel_txn,
-       .commit_txn             = x86_pmu_commit_txn,
-
-       .event_idx              = x86_pmu_event_idx,
-       .sched_task             = x86_pmu_sched_task,
-       .task_ctx_size          = sizeof(struct x86_perf_task_context),
-};
-
-void arch_perf_update_userpage(struct perf_event *event,
-                              struct perf_event_mmap_page *userpg, u64 now)
-{
-       struct cyc2ns_data *data;
-
-       userpg->cap_user_time = 0;
-       userpg->cap_user_time_zero = 0;
-       userpg->cap_user_rdpmc =
-               !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
-       userpg->pmc_width = x86_pmu.cntval_bits;
-
-       if (!sched_clock_stable())
-               return;
-
-       data = cyc2ns_read_begin();
-
-       /*
-        * Internal timekeeping for enabled/running/stopped times
-        * is always in the local_clock domain.
-        */
-       userpg->cap_user_time = 1;
-       userpg->time_mult = data->cyc2ns_mul;
-       userpg->time_shift = data->cyc2ns_shift;
-       userpg->time_offset = data->cyc2ns_offset - now;
-
-       /*
-        * cap_user_time_zero doesn't make sense when we're using a different
-        * time base for the records.
-        */
-       if (event->clock == &local_clock) {
-               userpg->cap_user_time_zero = 1;
-               userpg->time_zero = data->cyc2ns_offset;
-       }
-
-       cyc2ns_read_end(data);
-}
-
-/*
- * callchain support
- */
-
-static int backtrace_stack(void *data, char *name)
-{
-       return 0;
-}
-
-static void backtrace_address(void *data, unsigned long addr, int reliable)
-{
-       struct perf_callchain_entry *entry = data;
-
-       perf_callchain_store(entry, addr);
-}
-
-static const struct stacktrace_ops backtrace_ops = {
-       .stack                  = backtrace_stack,
-       .address                = backtrace_address,
-       .walk_stack             = print_context_stack_bp,
-};
-
-void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
-{
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-               /* TODO: We don't support guest os callchain now */
-               return;
-       }
-
-       perf_callchain_store(entry, regs->ip);
-
-       dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
-}
-
-static inline int
-valid_user_frame(const void __user *fp, unsigned long size)
-{
-       return (__range_not_ok(fp, size, TASK_SIZE) == 0);
-}
-
-static unsigned long get_segment_base(unsigned int segment)
-{
-       struct desc_struct *desc;
-       int idx = segment >> 3;
-
-       if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
-#ifdef CONFIG_MODIFY_LDT_SYSCALL
-               struct ldt_struct *ldt;
-
-               if (idx > LDT_ENTRIES)
-                       return 0;
-
-               /* IRQs are off, so this synchronizes with smp_store_release */
-               ldt = lockless_dereference(current->active_mm->context.ldt);
-               if (!ldt || idx > ldt->size)
-                       return 0;
-
-               desc = &ldt->entries[idx];
-#else
-               return 0;
-#endif
-       } else {
-               if (idx > GDT_ENTRIES)
-                       return 0;
-
-               desc = raw_cpu_ptr(gdt_page.gdt) + idx;
-       }
-
-       return get_desc_base(desc);
-}
-
-#ifdef CONFIG_IA32_EMULATION
-
-#include <asm/compat.h>
-
-static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-       /* 32-bit process in 64-bit kernel. */
-       unsigned long ss_base, cs_base;
-       struct stack_frame_ia32 frame;
-       const void __user *fp;
-
-       if (!test_thread_flag(TIF_IA32))
-               return 0;
-
-       cs_base = get_segment_base(regs->cs);
-       ss_base = get_segment_base(regs->ss);
-
-       fp = compat_ptr(ss_base + regs->bp);
-       pagefault_disable();
-       while (entry->nr < PERF_MAX_STACK_DEPTH) {
-               unsigned long bytes;
-               frame.next_frame     = 0;
-               frame.return_address = 0;
-
-               if (!access_ok(VERIFY_READ, fp, 8))
-                       break;
-
-               bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4);
-               if (bytes != 0)
-                       break;
-               bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4);
-               if (bytes != 0)
-                       break;
-
-               if (!valid_user_frame(fp, sizeof(frame)))
-                       break;
-
-               perf_callchain_store(entry, cs_base + frame.return_address);
-               fp = compat_ptr(ss_base + frame.next_frame);
-       }
-       pagefault_enable();
-       return 1;
-}
-#else
-static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-    return 0;
-}
-#endif
-
-void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
-{
-       struct stack_frame frame;
-       const void __user *fp;
-
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-               /* TODO: We don't support guest os callchain now */
-               return;
-       }
-
-       /*
-        * We don't know what to do with VM86 stacks.. ignore them for now.
-        */
-       if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
-               return;
-
-       fp = (void __user *)regs->bp;
-
-       perf_callchain_store(entry, regs->ip);
-
-       if (!current->mm)
-               return;
-
-       if (perf_callchain_user32(regs, entry))
-               return;
-
-       pagefault_disable();
-       while (entry->nr < PERF_MAX_STACK_DEPTH) {
-               unsigned long bytes;
-               frame.next_frame             = NULL;
-               frame.return_address = 0;
-
-               if (!access_ok(VERIFY_READ, fp, 16))
-                       break;
-
-               bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8);
-               if (bytes != 0)
-                       break;
-               bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8);
-               if (bytes != 0)
-                       break;
-
-               if (!valid_user_frame(fp, sizeof(frame)))
-                       break;
-
-               perf_callchain_store(entry, frame.return_address);
-               fp = (void __user *)frame.next_frame;
-       }
-       pagefault_enable();
-}
-
-/*
- * Deal with code segment offsets for the various execution modes:
- *
- *   VM86 - the good olde 16 bit days, where the linear address is
- *          20 bits and we use regs->ip + 0x10 * regs->cs.
- *
- *   IA32 - Where we need to look at GDT/LDT segment descriptor tables
- *          to figure out what the 32bit base address is.
- *
- *    X32 - has TIF_X32 set, but is running in x86_64
- *
- * X86_64 - CS,DS,SS,ES are all zero based.
- */
-static unsigned long code_segment_base(struct pt_regs *regs)
-{
-       /*
-        * For IA32 we look at the GDT/LDT segment base to convert the
-        * effective IP to a linear address.
-        */
-
-#ifdef CONFIG_X86_32
-       /*
-        * If we are in VM86 mode, add the segment offset to convert to a
-        * linear address.
-        */
-       if (regs->flags & X86_VM_MASK)
-               return 0x10 * regs->cs;
-
-       if (user_mode(regs) && regs->cs != __USER_CS)
-               return get_segment_base(regs->cs);
-#else
-       if (user_mode(regs) && !user_64bit_mode(regs) &&
-           regs->cs != __USER32_CS)
-               return get_segment_base(regs->cs);
-#endif
-       return 0;
-}
-
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
-{
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
-               return perf_guest_cbs->get_guest_ip();
-
-       return regs->ip + code_segment_base(regs);
-}
-
-unsigned long perf_misc_flags(struct pt_regs *regs)
-{
-       int misc = 0;
-
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-               if (perf_guest_cbs->is_user_mode())
-                       misc |= PERF_RECORD_MISC_GUEST_USER;
-               else
-                       misc |= PERF_RECORD_MISC_GUEST_KERNEL;
-       } else {
-               if (user_mode(regs))
-                       misc |= PERF_RECORD_MISC_USER;
-               else
-                       misc |= PERF_RECORD_MISC_KERNEL;
-       }
-
-       if (regs->flags & PERF_EFLAGS_EXACT)
-               misc |= PERF_RECORD_MISC_EXACT_IP;
-
-       return misc;
-}
-
-void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
-{
-       cap->version            = x86_pmu.version;
-       cap->num_counters_gp    = x86_pmu.num_counters;
-       cap->num_counters_fixed = x86_pmu.num_counters_fixed;
-       cap->bit_width_gp       = x86_pmu.cntval_bits;
-       cap->bit_width_fixed    = x86_pmu.cntval_bits;
-       cap->events_mask        = (unsigned int)x86_pmu.events_maskl;
-       cap->events_mask_len    = x86_pmu.events_mask_len;
-}
-EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h

deleted file mode 100644 (file)

index 7bb61e3..0000000
--- a/arch/x86/kernel/cpu/perf_event.h
+++ /dev/null
@@ -1,955 +0,0 @@
-/*
- * Performance events x86 architecture header
- *
- *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- *  Copyright (C) 2009 Jaswinder Singh Rajput
- *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
- *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
- *  Copyright (C) 2009 Google, Inc., Stephane Eranian
- *
- *  For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-
-/* To enable MSR tracing please use the generic trace points. */
-
-/*
- *          |   NHM/WSM    |      SNB     |
- * register -------------------------------
- *          |  HT  | no HT |  HT  | no HT |
- *-----------------------------------------
- * offcore  | core | core  | cpu  | core  |
- * lbr_sel  | core | core  | cpu  | core  |
- * ld_lat   | cpu  | core  | cpu  | core  |
- *-----------------------------------------
- *
- * Given that there is a small number of shared regs,
- * we can pre-allocate their slot in the per-cpu
- * per-core reg tables.
- */
-enum extra_reg_type {
-       EXTRA_REG_NONE  = -1,   /* not used */
-
-       EXTRA_REG_RSP_0 = 0,    /* offcore_response_0 */
-       EXTRA_REG_RSP_1 = 1,    /* offcore_response_1 */
-       EXTRA_REG_LBR   = 2,    /* lbr_select */
-       EXTRA_REG_LDLAT = 3,    /* ld_lat_threshold */
-       EXTRA_REG_FE    = 4,    /* fe_* */
-
-       EXTRA_REG_MAX           /* number of entries needed */
-};
-
-struct event_constraint {
-       union {
-               unsigned long   idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-               u64             idxmsk64;
-       };
-       u64     code;
-       u64     cmask;
-       int     weight;
-       int     overlap;
-       int     flags;
-};
-/*
- * struct hw_perf_event.flags flags
- */
-#define PERF_X86_EVENT_PEBS_LDLAT      0x0001 /* ld+ldlat data address sampling */
-#define PERF_X86_EVENT_PEBS_ST         0x0002 /* st data address sampling */
-#define PERF_X86_EVENT_PEBS_ST_HSW     0x0004 /* haswell style datala, store */
-#define PERF_X86_EVENT_COMMITTED       0x0008 /* event passed commit_txn */
-#define PERF_X86_EVENT_PEBS_LD_HSW     0x0010 /* haswell style datala, load */
-#define PERF_X86_EVENT_PEBS_NA_HSW     0x0020 /* haswell style datala, unknown */
-#define PERF_X86_EVENT_EXCL            0x0040 /* HT exclusivity on counter */
-#define PERF_X86_EVENT_DYNAMIC         0x0080 /* dynamic alloc'd constraint */
-#define PERF_X86_EVENT_RDPMC_ALLOWED   0x0100 /* grant rdpmc permission */
-#define PERF_X86_EVENT_EXCL_ACCT       0x0200 /* accounted EXCL event */
-#define PERF_X86_EVENT_AUTO_RELOAD     0x0400 /* use PEBS auto-reload */
-#define PERF_X86_EVENT_FREERUNNING     0x0800 /* use freerunning PEBS */
-
-
-struct amd_nb {
-       int nb_id;  /* NorthBridge id */
-       int refcnt; /* reference count */
-       struct perf_event *owners[X86_PMC_IDX_MAX];
-       struct event_constraint event_constraints[X86_PMC_IDX_MAX];
-};
-
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS                8
-
-/*
- * Flags PEBS can handle without an PMI.
- *
- * TID can only be handled by flushing at context switch.
- *
- */
-#define PEBS_FREERUNNING_FLAGS \
-       (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
-       PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
-       PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
-       PERF_SAMPLE_TRANSACTION)
-
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
-       u64     bts_buffer_base;
-       u64     bts_index;
-       u64     bts_absolute_maximum;
-       u64     bts_interrupt_threshold;
-       u64     pebs_buffer_base;
-       u64     pebs_index;
-       u64     pebs_absolute_maximum;
-       u64     pebs_interrupt_threshold;
-       u64     pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
-/*
- * Per register state.
- */
-struct er_account {
-       raw_spinlock_t          lock;   /* per-core: protect structure */
-       u64                 config;     /* extra MSR config */
-       u64                 reg;        /* extra MSR number */
-       atomic_t            ref;        /* reference count */
-};
-
-/*
- * Per core/cpu state
- *
- * Used to coordinate shared registers between HT threads or
- * among events on a single PMU.
- */
-struct intel_shared_regs {
-       struct er_account       regs[EXTRA_REG_MAX];
-       int                     refcnt;         /* per-core: #HT threads */
-       unsigned                core_id;        /* per-core: core id */
-};
-
-enum intel_excl_state_type {
-       INTEL_EXCL_UNUSED    = 0, /* counter is unused */
-       INTEL_EXCL_SHARED    = 1, /* counter can be used by both threads */
-       INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
-};
-
-struct intel_excl_states {
-       enum intel_excl_state_type state[X86_PMC_IDX_MAX];
-       bool sched_started; /* true if scheduling has started */
-};
-
-struct intel_excl_cntrs {
-       raw_spinlock_t  lock;
-
-       struct intel_excl_states states[2];
-
-       union {
-               u16     has_exclusive[2];
-               u32     exclusive_present;
-       };
-
-       int             refcnt;         /* per-core: #HT threads */
-       unsigned        core_id;        /* per-core: core id */
-};
-
-#define MAX_LBR_ENTRIES                32
-
-enum {
-       X86_PERF_KFREE_SHARED = 0,
-       X86_PERF_KFREE_EXCL   = 1,
-       X86_PERF_KFREE_MAX
-};
-
-struct cpu_hw_events {
-       /*
-        * Generic x86 PMC bits
-        */
-       struct perf_event       *events[X86_PMC_IDX_MAX]; /* in counter order */
-       unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-       unsigned long           running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-       int                     enabled;
-
-       int                     n_events; /* the # of events in the below arrays */
-       int                     n_added;  /* the # last events in the below arrays;
-                                            they've never been enabled yet */
-       int                     n_txn;    /* the # last events in the below arrays;
-                                            added in the current transaction */
-       int                     assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
-       u64                     tags[X86_PMC_IDX_MAX];
-
-       struct perf_event       *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
-       struct event_constraint *event_constraint[X86_PMC_IDX_MAX];
-
-       int                     n_excl; /* the number of exclusive events */
-
-       unsigned int            txn_flags;
-       int                     is_fake;
-
-       /*
-        * Intel DebugStore bits
-        */
-       struct debug_store      *ds;
-       u64                     pebs_enabled;
-
-       /*
-        * Intel LBR bits
-        */
-       int                             lbr_users;
-       void                            *lbr_context;
-       struct perf_branch_stack        lbr_stack;
-       struct perf_branch_entry        lbr_entries[MAX_LBR_ENTRIES];
-       struct er_account               *lbr_sel;
-       u64                             br_sel;
-
-       /*
-        * Intel host/guest exclude bits
-        */
-       u64                             intel_ctrl_guest_mask;
-       u64                             intel_ctrl_host_mask;
-       struct perf_guest_switch_msr    guest_switch_msrs[X86_PMC_IDX_MAX];
-
-       /*
-        * Intel checkpoint mask
-        */
-       u64                             intel_cp_status;
-
-       /*
-        * manage shared (per-core, per-cpu) registers
-        * used on Intel NHM/WSM/SNB
-        */
-       struct intel_shared_regs        *shared_regs;
-       /*
-        * manage exclusive counter access between hyperthread
-        */
-       struct event_constraint *constraint_list; /* in enable order */
-       struct intel_excl_cntrs         *excl_cntrs;
-       int excl_thread_id; /* 0 or 1 */
-
-       /*
-        * AMD specific bits
-        */
-       struct amd_nb                   *amd_nb;
-       /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
-       u64                             perf_ctr_virt_mask;
-
-       void                            *kfree_on_online[X86_PERF_KFREE_MAX];
-};
-
-#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
-       { .idxmsk64 = (n) },            \
-       .code = (c),                    \
-       .cmask = (m),                   \
-       .weight = (w),                  \
-       .overlap = (o),                 \
-       .flags = f,                     \
-}
-
-#define EVENT_CONSTRAINT(c, n, m)      \
-       __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
-
-#define INTEL_EXCLEVT_CONSTRAINT(c, n) \
-       __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
-                          0, PERF_X86_EVENT_EXCL)
-
-/*
- * The overlap flag marks event constraints with overlapping counter
- * masks. This is the case if the counter mask of such an event is not
- * a subset of any other counter mask of a constraint with an equal or
- * higher weight, e.g.:
- *
- *  c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
- *  c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
- *  c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
- *
- * The event scheduler may not select the correct counter in the first
- * cycle because it needs to know which subsequent events will be
- * scheduled. It may fail to schedule the events then. So we set the
- * overlap flag for such constraints to give the scheduler a hint which
- * events to select for counter rescheduling.
- *
- * Care must be taken as the rescheduling algorithm is O(n!) which
- * will increase scheduling cycles for an over-commited system
- * dramatically.  The number of such EVENT_CONSTRAINT_OVERLAP() macros
- * and its counter masks must be kept at a minimum.
- */
-#define EVENT_CONSTRAINT_OVERLAP(c, n, m)      \
-       __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0)
-
-/*
- * Constraint on the Event code.
- */
-#define INTEL_EVENT_CONSTRAINT(c, n)   \
-       EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
-
-/*
- * Constraint on the Event code + UMask + fixed-mask
- *
- * filter mask to validate fixed counter events.
- * the following filters disqualify for fixed counters:
- *  - inv
- *  - edge
- *  - cnt-mask
- *  - in_tx
- *  - in_tx_checkpointed
- *  The other filters are supported by fixed counters.
- *  The any-thread option is supported starting with v3.
- */
-#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)
-#define FIXED_EVENT_CONSTRAINT(c, n)   \
-       EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS)
-
-/*
- * Constraint on the Event code + UMask
- */
-#define INTEL_UEVENT_CONSTRAINT(c, n)  \
-       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
-
-/* Constraint on specific umask bit only + event */
-#define INTEL_UBIT_EVENT_CONSTRAINT(c, n)      \
-       EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c))
-
-/* Like UEVENT_CONSTRAINT, but match flags too */
-#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n)    \
-       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
-
-#define INTEL_EXCLUEVT_CONSTRAINT(c, n)        \
-       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
-                          HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
-
-#define INTEL_PLD_CONSTRAINT(c, n)     \
-       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                          HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
-
-#define INTEL_PST_CONSTRAINT(c, n)     \
-       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
-
-/* Event constraint, but match on all event flags too. */
-#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
-       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
-
-/* Check only flags, but allow all event/umask */
-#define INTEL_ALL_EVENT_CONSTRAINT(code, n)    \
-       EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS)
-
-/* Check flags and event code, and set the HSW store flag */
-#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
-
-/* Check flags and event code, and set the HSW load flag */
-#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
-
-#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, \
-                         PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
-
-/* Check flags and event code/umask, and set the HSW store flag */
-#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
-
-#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, \
-                         PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)
-
-/* Check flags and event code/umask, and set the HSW load flag */
-#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
-
-#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, \
-                         PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
-
-/* Check flags and event code/umask, and set the HSW N/A flag */
-#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
-
-
-/*
- * We define the end marker as having a weight of -1
- * to enable blacklisting of events using a counter bitmask
- * of zero and thus a weight of zero.
- * The end marker has a weight that cannot possibly be
- * obtained from counting the bits in the bitmask.
- */
-#define EVENT_CONSTRAINT_END { .weight = -1 }
-
-/*
- * Check for end marker with weight == -1
- */
-#define for_each_event_constraint(e, c)        \
-       for ((e) = (c); (e)->weight != -1; (e)++)
-
-/*
- * Extra registers for specific events.
- *
- * Some events need large masks and require external MSRs.
- * Those extra MSRs end up being shared for all events on
- * a PMU and sometimes between PMU of sibling HT threads.
- * In either case, the kernel needs to handle conflicting
- * accesses to those extra, shared, regs. The data structure
- * to manage those registers is stored in cpu_hw_event.
- */
-struct extra_reg {
-       unsigned int            event;
-       unsigned int            msr;
-       u64                     config_mask;
-       u64                     valid_mask;
-       int                     idx;  /* per_xxx->regs[] reg index */
-       bool                    extra_msr_access;
-};
-
-#define EVENT_EXTRA_REG(e, ms, m, vm, i) {     \
-       .event = (e),                   \
-       .msr = (ms),                    \
-       .config_mask = (m),             \
-       .valid_mask = (vm),             \
-       .idx = EXTRA_REG_##i,           \
-       .extra_msr_access = true,       \
-       }
-
-#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)     \
-       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
-
-#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \
-       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
-                       ARCH_PERFMON_EVENTSEL_UMASK, vm, idx)
-
-#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \
-       INTEL_UEVENT_EXTRA_REG(c, \
-                              MSR_PEBS_LD_LAT_THRESHOLD, \
-                              0xffff, \
-                              LDLAT)
-
-#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
-
-union perf_capabilities {
-       struct {
-               u64     lbr_format:6;
-               u64     pebs_trap:1;
-               u64     pebs_arch_reg:1;
-               u64     pebs_format:4;
-               u64     smm_freeze:1;
-               /*
-                * PMU supports separate counter range for writing
-                * values > 32bit.
-                */
-               u64     full_width_write:1;
-       };
-       u64     capabilities;
-};
-
-struct x86_pmu_quirk {
-       struct x86_pmu_quirk *next;
-       void (*func)(void);
-};
-
-union x86_pmu_config {
-       struct {
-               u64 event:8,
-                   umask:8,
-                   usr:1,
-                   os:1,
-                   edge:1,
-                   pc:1,
-                   interrupt:1,
-                   __reserved1:1,
-                   en:1,
-                   inv:1,
-                   cmask:8,
-                   event2:4,
-                   __reserved2:4,
-                   go:1,
-                   ho:1;
-       } bits;
-       u64 value;
-};
-
-#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
-
-enum {
-       x86_lbr_exclusive_lbr,
-       x86_lbr_exclusive_bts,
-       x86_lbr_exclusive_pt,
-       x86_lbr_exclusive_max,
-};
-
-/*
- * struct x86_pmu - generic x86 pmu
- */
-struct x86_pmu {
-       /*
-        * Generic x86 PMC bits
-        */
-       const char      *name;
-       int             version;
-       int             (*handle_irq)(struct pt_regs *);
-       void            (*disable_all)(void);
-       void            (*enable_all)(int added);
-       void            (*enable)(struct perf_event *);
-       void            (*disable)(struct perf_event *);
-       int             (*hw_config)(struct perf_event *event);
-       int             (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
-       unsigned        eventsel;
-       unsigned        perfctr;
-       int             (*addr_offset)(int index, bool eventsel);
-       int             (*rdpmc_index)(int index);
-       u64             (*event_map)(int);
-       int             max_events;
-       int             num_counters;
-       int             num_counters_fixed;
-       int             cntval_bits;
-       u64             cntval_mask;
-       union {
-                       unsigned long events_maskl;
-                       unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
-       };
-       int             events_mask_len;
-       int             apic;
-       u64             max_period;
-       struct event_constraint *
-                       (*get_event_constraints)(struct cpu_hw_events *cpuc,
-                                                int idx,
-                                                struct perf_event *event);
-
-       void            (*put_event_constraints)(struct cpu_hw_events *cpuc,
-                                                struct perf_event *event);
-
-       void            (*start_scheduling)(struct cpu_hw_events *cpuc);
-
-       void            (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr);
-
-       void            (*stop_scheduling)(struct cpu_hw_events *cpuc);
-
-       struct event_constraint *event_constraints;
-       struct x86_pmu_quirk *quirks;
-       int             perfctr_second_write;
-       bool            late_ack;
-       unsigned        (*limit_period)(struct perf_event *event, unsigned l);
-
-       /*
-        * sysfs attrs
-        */
-       int             attr_rdpmc_broken;
-       int             attr_rdpmc;
-       struct attribute **format_attrs;
-       struct attribute **event_attrs;
-
-       ssize_t         (*events_sysfs_show)(char *page, u64 config);
-       struct attribute **cpu_events;
-
-       /*
-        * CPU Hotplug hooks
-        */
-       int             (*cpu_prepare)(int cpu);
-       void            (*cpu_starting)(int cpu);
-       void            (*cpu_dying)(int cpu);
-       void            (*cpu_dead)(int cpu);
-
-       void            (*check_microcode)(void);
-       void            (*sched_task)(struct perf_event_context *ctx,
-                                     bool sched_in);
-
-       /*
-        * Intel Arch Perfmon v2+
-        */
-       u64                     intel_ctrl;
-       union perf_capabilities intel_cap;
-
-       /*
-        * Intel DebugStore bits
-        */
-       unsigned int    bts             :1,
-                       bts_active      :1,
-                       pebs            :1,
-                       pebs_active     :1,
-                       pebs_broken     :1,
-                       pebs_prec_dist  :1;
-       int             pebs_record_size;
-       void            (*drain_pebs)(struct pt_regs *regs);
-       struct event_constraint *pebs_constraints;
-       void            (*pebs_aliases)(struct perf_event *event);
-       int             max_pebs_events;
-       unsigned long   free_running_flags;
-
-       /*
-        * Intel LBR
-        */
-       unsigned long   lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
-       int             lbr_nr;                    /* hardware stack size */
-       u64             lbr_sel_mask;              /* LBR_SELECT valid bits */
-       const int       *lbr_sel_map;              /* lbr_select mappings */
-       bool            lbr_double_abort;          /* duplicated lbr aborts */
-
-       /*
-        * Intel PT/LBR/BTS are exclusive
-        */
-       atomic_t        lbr_exclusive[x86_lbr_exclusive_max];
-
-       /*
-        * Extra registers for events
-        */
-       struct extra_reg *extra_regs;
-       unsigned int flags;
-
-       /*
-        * Intel host/guest support (KVM)
-        */
-       struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
-};
-
-struct x86_perf_task_context {
-       u64 lbr_from[MAX_LBR_ENTRIES];
-       u64 lbr_to[MAX_LBR_ENTRIES];
-       u64 lbr_info[MAX_LBR_ENTRIES];
-       int tos;
-       int lbr_callstack_users;
-       int lbr_stack_state;
-};
-
-#define x86_add_quirk(func_)                                           \
-do {                                                                   \
-       static struct x86_pmu_quirk __quirk __initdata = {              \
-               .func = func_,                                          \
-       };                                                              \
-       __quirk.next = x86_pmu.quirks;                                  \
-       x86_pmu.quirks = &__quirk;                                      \
-} while (0)
-
-/*
- * x86_pmu flags
- */
-#define PMU_FL_NO_HT_SHARING   0x1 /* no hyper-threading resource sharing */
-#define PMU_FL_HAS_RSP_1       0x2 /* has 2 equivalent offcore_rsp regs   */
-#define PMU_FL_EXCL_CNTRS      0x4 /* has exclusive counter requirements  */
-#define PMU_FL_EXCL_ENABLED    0x8 /* exclusive counter active */
-
-#define EVENT_VAR(_id)  event_attr_##_id
-#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
-
-#define EVENT_ATTR(_name, _id)                                         \
-static struct perf_pmu_events_attr EVENT_VAR(_id) = {                  \
-       .attr           = __ATTR(_name, 0444, events_sysfs_show, NULL), \
-       .id             = PERF_COUNT_HW_##_id,                          \
-       .event_str      = NULL,                                         \
-};
-
-#define EVENT_ATTR_STR(_name, v, str)                                  \
-static struct perf_pmu_events_attr event_attr_##v = {                  \
-       .attr           = __ATTR(_name, 0444, events_sysfs_show, NULL), \
-       .id             = 0,                                            \
-       .event_str      = str,                                          \
-};
-
-extern struct x86_pmu x86_pmu __read_mostly;
-
-static inline bool x86_pmu_has_lbr_callstack(void)
-{
-       return  x86_pmu.lbr_sel_map &&
-               x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
-}
-
-DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
-
-int x86_perf_event_set_period(struct perf_event *event);
-
-/*
- * Generalized hw caching related hw_event table, filled
- * in on a per model basis. A value of 0 means
- * 'not supported', -1 means 'hw_event makes no sense on
- * this CPU', any other value means the raw hw_event
- * ID.
- */
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-extern u64 __read_mostly hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
-extern u64 __read_mostly hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
-
-u64 x86_perf_event_update(struct perf_event *event);
-
-static inline unsigned int x86_pmu_config_addr(int index)
-{
-       return x86_pmu.eventsel + (x86_pmu.addr_offset ?
-                                  x86_pmu.addr_offset(index, true) : index);
-}
-
-static inline unsigned int x86_pmu_event_addr(int index)
-{
-       return x86_pmu.perfctr + (x86_pmu.addr_offset ?
-                                 x86_pmu.addr_offset(index, false) : index);
-}
-
-static inline int x86_pmu_rdpmc_index(int index)
-{
-       return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
-}
-
-int x86_add_exclusive(unsigned int what);
-
-void x86_del_exclusive(unsigned int what);
-
-int x86_reserve_hardware(void);
-
-void x86_release_hardware(void);
-
-void hw_perf_lbr_event_destroy(struct perf_event *event);
-
-int x86_setup_perfctr(struct perf_event *event);
-
-int x86_pmu_hw_config(struct perf_event *event);
-
-void x86_pmu_disable_all(void);
-
-static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
-                                         u64 enable_mask)
-{
-       u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
-
-       if (hwc->extra_reg.reg)
-               wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
-       wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
-}
-
-void x86_pmu_enable_all(int added);
-
-int perf_assign_events(struct event_constraint **constraints, int n,
-                       int wmin, int wmax, int gpmax, int *assign);
-int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
-
-void x86_pmu_stop(struct perf_event *event, int flags);
-
-static inline void x86_pmu_disable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       wrmsrl(hwc->config_base, hwc->config);
-}
-
-void x86_pmu_enable_event(struct perf_event *event);
-
-int x86_pmu_handle_irq(struct pt_regs *regs);
-
-extern struct event_constraint emptyconstraint;
-
-extern struct event_constraint unconstrained;
-
-static inline bool kernel_ip(unsigned long ip)
-{
-#ifdef CONFIG_X86_32
-       return ip > PAGE_OFFSET;
-#else
-       return (long)ip < 0;
-#endif
-}
-
-/*
- * Not all PMUs provide the right context information to place the reported IP
- * into full context. Specifically segment registers are typically not
- * supplied.
- *
- * Assuming the address is a linear address (it is for IBS), we fake the CS and
- * vm86 mode using the known zero-based code segment and 'fix up' the registers
- * to reflect this.
- *
- * Intel PEBS/LBR appear to typically provide the effective address, nothing
- * much we can do about that but pray and treat it like a linear address.
- */
-static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
-{
-       regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS;
-       if (regs->flags & X86_VM_MASK)
-               regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK);
-       regs->ip = ip;
-}
-
-ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
-ssize_t intel_event_sysfs_show(char *page, u64 config);
-
-struct attribute **merge_attr(struct attribute **a, struct attribute **b);
-
-#ifdef CONFIG_CPU_SUP_AMD
-
-int amd_pmu_init(void);
-
-#else /* CONFIG_CPU_SUP_AMD */
-
-static inline int amd_pmu_init(void)
-{
-       return 0;
-}
-
-#endif /* CONFIG_CPU_SUP_AMD */
-
-#ifdef CONFIG_CPU_SUP_INTEL
-
-static inline bool intel_pmu_has_bts(struct perf_event *event)
-{
-       if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
-           !event->attr.freq && event->hw.sample_period == 1)
-               return true;
-
-       return false;
-}
-
-int intel_pmu_save_and_restart(struct perf_event *event);
-
-struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                         struct perf_event *event);
-
-struct intel_shared_regs *allocate_shared_regs(int cpu);
-
-int intel_pmu_init(void);
-
-void init_debug_store_on_cpu(int cpu);
-
-void fini_debug_store_on_cpu(int cpu);
-
-void release_ds_buffers(void);
-
-void reserve_ds_buffers(void);
-
-extern struct event_constraint bts_constraint;
-
-void intel_pmu_enable_bts(u64 config);
-
-void intel_pmu_disable_bts(void);
-
-int intel_pmu_drain_bts_buffer(void);
-
-extern struct event_constraint intel_core2_pebs_event_constraints[];
-
-extern struct event_constraint intel_atom_pebs_event_constraints[];
-
-extern struct event_constraint intel_slm_pebs_event_constraints[];
-
-extern struct event_constraint intel_nehalem_pebs_event_constraints[];
-
-extern struct event_constraint intel_westmere_pebs_event_constraints[];
-
-extern struct event_constraint intel_snb_pebs_event_constraints[];
-
-extern struct event_constraint intel_ivb_pebs_event_constraints[];
-
-extern struct event_constraint intel_hsw_pebs_event_constraints[];
-
-extern struct event_constraint intel_skl_pebs_event_constraints[];
-
-struct event_constraint *intel_pebs_constraints(struct perf_event *event);
-
-void intel_pmu_pebs_enable(struct perf_event *event);
-
-void intel_pmu_pebs_disable(struct perf_event *event);
-
-void intel_pmu_pebs_enable_all(void);
-
-void intel_pmu_pebs_disable_all(void);
-
-void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
-
-void intel_ds_init(void);
-
-void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
-
-void intel_pmu_lbr_reset(void);
-
-void intel_pmu_lbr_enable(struct perf_event *event);
-
-void intel_pmu_lbr_disable(struct perf_event *event);
-
-void intel_pmu_lbr_enable_all(bool pmi);
-
-void intel_pmu_lbr_disable_all(void);
-
-void intel_pmu_lbr_read(void);
-
-void intel_pmu_lbr_init_core(void);
-
-void intel_pmu_lbr_init_nhm(void);
-
-void intel_pmu_lbr_init_atom(void);
-
-void intel_pmu_lbr_init_snb(void);
-
-void intel_pmu_lbr_init_hsw(void);
-
-void intel_pmu_lbr_init_skl(void);
-
-void intel_pmu_lbr_init_knl(void);
-
-int intel_pmu_setup_lbr_filter(struct perf_event *event);
-
-void intel_pt_interrupt(void);
-
-int intel_bts_interrupt(void);
-
-void intel_bts_enable_local(void);
-
-void intel_bts_disable_local(void);
-
-int p4_pmu_init(void);
-
-int p6_pmu_init(void);
-
-int knc_pmu_init(void);
-
-ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
-                         char *page);
-
-static inline int is_ht_workaround_enabled(void)
-{
-       return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
-}
-
-#else /* CONFIG_CPU_SUP_INTEL */
-
-static inline void reserve_ds_buffers(void)
-{
-}
-
-static inline void release_ds_buffers(void)
-{
-}
-
-static inline int intel_pmu_init(void)
-{
-       return 0;
-}
-
-static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
-{
-       return NULL;
-}
-
-static inline int is_ht_workaround_enabled(void)
-{
-       return 0;
-}
-#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c

deleted file mode 100644 (file)

index 5861053..0000000
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ /dev/null
@@ -1,731 +0,0 @@
-#include <linux/perf_event.h>
-#include <linux/export.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <asm/apicdef.h>
-
-#include "perf_event.h"
-
-static __initconst const u64 amd_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
-               [ C(RESULT_MISS)   ] = 0x0141, /* Data Cache Misses          */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */
-               [ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
-               [ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
-               [ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback           */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
-               [ C(RESULT_MISS)   ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes        */
-               [ C(RESULT_MISS)   ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
-               [ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
-               [ C(RESULT_MISS)   ] = 0x98e9, /* CPU Request to Memory, r   */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-/*
- * AMD Performance Monitor K7 and later.
- */
-static const u64 amd_perfmon_event_map[] =
-{
-  [PERF_COUNT_HW_CPU_CYCLES]                   = 0x0076,
-  [PERF_COUNT_HW_INSTRUCTIONS]                 = 0x00c0,
-  [PERF_COUNT_HW_CACHE_REFERENCES]             = 0x0080,
-  [PERF_COUNT_HW_CACHE_MISSES]                 = 0x0081,
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]          = 0x00c2,
-  [PERF_COUNT_HW_BRANCH_MISSES]                        = 0x00c3,
-  [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]      = 0x00d0, /* "Decoder empty" event */
-  [PERF_COUNT_HW_STALLED_CYCLES_BACKEND]       = 0x00d1, /* "Dispatch stalls" event */
-};
-
-static u64 amd_pmu_event_map(int hw_event)
-{
-       return amd_perfmon_event_map[hw_event];
-}
-
-/*
- * Previously calculated offsets
- */
-static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
-static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
-
-/*
- * Legacy CPUs:
- *   4 counters starting at 0xc0010000 each offset by 1
- *
- * CPUs with core performance counter extensions:
- *   6 counters starting at 0xc0010200 each offset by 2
- */
-static inline int amd_pmu_addr_offset(int index, bool eventsel)
-{
-       int offset;
-
-       if (!index)
-               return index;
-
-       if (eventsel)
-               offset = event_offsets[index];
-       else
-               offset = count_offsets[index];
-
-       if (offset)
-               return offset;
-
-       if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
-               offset = index;
-       else
-               offset = index << 1;
-
-       if (eventsel)
-               event_offsets[index] = offset;
-       else
-               count_offsets[index] = offset;
-
-       return offset;
-}
-
-static int amd_core_hw_config(struct perf_event *event)
-{
-       if (event->attr.exclude_host && event->attr.exclude_guest)
-               /*
-                * When HO == GO == 1 the hardware treats that as GO == HO == 0
-                * and will count in both modes. We don't want to count in that
-                * case so we emulate no-counting by setting US = OS = 0.
-                */
-               event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
-                                     ARCH_PERFMON_EVENTSEL_OS);
-       else if (event->attr.exclude_host)
-               event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
-       else if (event->attr.exclude_guest)
-               event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
-
-       return 0;
-}
-
-/*
- * AMD64 events are detected based on their event codes.
- */
-static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
-{
-       return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
-}
-
-static inline int amd_is_nb_event(struct hw_perf_event *hwc)
-{
-       return (hwc->config & 0xe0) == 0xe0;
-}
-
-static inline int amd_has_nb(struct cpu_hw_events *cpuc)
-{
-       struct amd_nb *nb = cpuc->amd_nb;
-
-       return nb && nb->nb_id != -1;
-}
-
-static int amd_pmu_hw_config(struct perf_event *event)
-{
-       int ret;
-
-       /* pass precise event sampling to ibs: */
-       if (event->attr.precise_ip && get_ibs_caps())
-               return -ENOENT;
-
-       if (has_branch_stack(event))
-               return -EOPNOTSUPP;
-
-       ret = x86_pmu_hw_config(event);
-       if (ret)
-               return ret;
-
-       if (event->attr.type == PERF_TYPE_RAW)
-               event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
-
-       return amd_core_hw_config(event);
-}
-
-static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
-                                          struct perf_event *event)
-{
-       struct amd_nb *nb = cpuc->amd_nb;
-       int i;
-
-       /*
-        * need to scan whole list because event may not have
-        * been assigned during scheduling
-        *
-        * no race condition possible because event can only
-        * be removed on one CPU at a time AND PMU is disabled
-        * when we come here
-        */
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               if (cmpxchg(nb->owners + i, event, NULL) == event)
-                       break;
-       }
-}
-
- /*
-  * AMD64 NorthBridge events need special treatment because
-  * counter access needs to be synchronized across all cores
-  * of a package. Refer to BKDG section 3.12
-  *
-  * NB events are events measuring L3 cache, Hypertransport
-  * traffic. They are identified by an event code >= 0xe00.
-  * They measure events on the NorthBride which is shared
-  * by all cores on a package. NB events are counted on a
-  * shared set of counters. When a NB event is programmed
-  * in a counter, the data actually comes from a shared
-  * counter. Thus, access to those counters needs to be
-  * synchronized.
-  *
-  * We implement the synchronization such that no two cores
-  * can be measuring NB events using the same counters. Thus,
-  * we maintain a per-NB allocation table. The available slot
-  * is propagated using the event_constraint structure.
-  *
-  * We provide only one choice for each NB event based on
-  * the fact that only NB events have restrictions. Consequently,
-  * if a counter is available, there is a guarantee the NB event
-  * will be assigned to it. If no slot is available, an empty
-  * constraint is returned and scheduling will eventually fail
-  * for this event.
-  *
-  * Note that all cores attached the same NB compete for the same
-  * counters to host NB events, this is why we use atomic ops. Some
-  * multi-chip CPUs may have more than one NB.
-  *
-  * Given that resources are allocated (cmpxchg), they must be
-  * eventually freed for others to use. This is accomplished by
-  * calling __amd_put_nb_event_constraints()
-  *
-  * Non NB events are not impacted by this restriction.
-  */
-static struct event_constraint *
-__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
-                              struct event_constraint *c)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct amd_nb *nb = cpuc->amd_nb;
-       struct perf_event *old;
-       int idx, new = -1;
-
-       if (!c)
-               c = &unconstrained;
-
-       if (cpuc->is_fake)
-               return c;
-
-       /*
-        * detect if already present, if so reuse
-        *
-        * cannot merge with actual allocation
-        * because of possible holes
-        *
-        * event can already be present yet not assigned (in hwc->idx)
-        * because of successive calls to x86_schedule_events() from
-        * hw_perf_group_sched_in() without hw_perf_enable()
-        */
-       for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
-               if (new == -1 || hwc->idx == idx)
-                       /* assign free slot, prefer hwc->idx */
-                       old = cmpxchg(nb->owners + idx, NULL, event);
-               else if (nb->owners[idx] == event)
-                       /* event already present */
-                       old = event;
-               else
-                       continue;
-
-               if (old && old != event)
-                       continue;
-
-               /* reassign to this slot */
-               if (new != -1)
-                       cmpxchg(nb->owners + new, event, NULL);
-               new = idx;
-
-               /* already present, reuse */
-               if (old == event)
-                       break;
-       }
-
-       if (new == -1)
-               return &emptyconstraint;
-
-       return &nb->event_constraints[new];
-}
-
-static struct amd_nb *amd_alloc_nb(int cpu)
-{
-       struct amd_nb *nb;
-       int i;
-
-       nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu));
-       if (!nb)
-               return NULL;
-
-       nb->nb_id = -1;
-
-       /*
-        * initialize all possible NB constraints
-        */
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               __set_bit(i, nb->event_constraints[i].idxmsk);
-               nb->event_constraints[i].weight = 1;
-       }
-       return nb;
-}
-
-static int amd_pmu_cpu_prepare(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-
-       WARN_ON_ONCE(cpuc->amd_nb);
-
-       if (boot_cpu_data.x86_max_cores < 2)
-               return NOTIFY_OK;
-
-       cpuc->amd_nb = amd_alloc_nb(cpu);
-       if (!cpuc->amd_nb)
-               return NOTIFY_BAD;
-
-       return NOTIFY_OK;
-}
-
-static void amd_pmu_cpu_starting(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-       void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
-       struct amd_nb *nb;
-       int i, nb_id;
-
-       cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
-
-       if (boot_cpu_data.x86_max_cores < 2)
-               return;
-
-       nb_id = amd_get_nb_id(cpu);
-       WARN_ON_ONCE(nb_id == BAD_APICID);
-
-       for_each_online_cpu(i) {
-               nb = per_cpu(cpu_hw_events, i).amd_nb;
-               if (WARN_ON_ONCE(!nb))
-                       continue;
-
-               if (nb->nb_id == nb_id) {
-                       *onln = cpuc->amd_nb;
-                       cpuc->amd_nb = nb;
-                       break;
-               }
-       }
-
-       cpuc->amd_nb->nb_id = nb_id;
-       cpuc->amd_nb->refcnt++;
-}
-
-static void amd_pmu_cpu_dead(int cpu)
-{
-       struct cpu_hw_events *cpuhw;
-
-       if (boot_cpu_data.x86_max_cores < 2)
-               return;
-
-       cpuhw = &per_cpu(cpu_hw_events, cpu);
-
-       if (cpuhw->amd_nb) {
-               struct amd_nb *nb = cpuhw->amd_nb;
-
-               if (nb->nb_id == -1 || --nb->refcnt == 0)
-                       kfree(nb);
-
-               cpuhw->amd_nb = NULL;
-       }
-}
-
-static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                         struct perf_event *event)
-{
-       /*
-        * if not NB event or no NB, then no constraints
-        */
-       if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
-               return &unconstrained;
-
-       return __amd_get_nb_event_constraints(cpuc, event, NULL);
-}
-
-static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
-                                     struct perf_event *event)
-{
-       if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
-               __amd_put_nb_event_constraints(cpuc, event);
-}
-
-PMU_FORMAT_ATTR(event, "config:0-7,32-35");
-PMU_FORMAT_ATTR(umask, "config:8-15"   );
-PMU_FORMAT_ATTR(edge,  "config:18"     );
-PMU_FORMAT_ATTR(inv,   "config:23"     );
-PMU_FORMAT_ATTR(cmask, "config:24-31"  );
-
-static struct attribute *amd_format_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask.attr,
-       NULL,
-};
-
-/* AMD Family 15h */
-
-#define AMD_EVENT_TYPE_MASK    0x000000F0ULL
-
-#define AMD_EVENT_FP           0x00000000ULL ... 0x00000010ULL
-#define AMD_EVENT_LS           0x00000020ULL ... 0x00000030ULL
-#define AMD_EVENT_DC           0x00000040ULL ... 0x00000050ULL
-#define AMD_EVENT_CU           0x00000060ULL ... 0x00000070ULL
-#define AMD_EVENT_IC_DE                0x00000080ULL ... 0x00000090ULL
-#define AMD_EVENT_EX_LS                0x000000C0ULL
-#define AMD_EVENT_DE           0x000000D0ULL
-#define AMD_EVENT_NB           0x000000E0ULL ... 0x000000F0ULL
-
-/*
- * AMD family 15h event code/PMC mappings:
- *
- * type = event_code & 0x0F0:
- *
- * 0x000       FP      PERF_CTL[5:3]
- * 0x010       FP      PERF_CTL[5:3]
- * 0x020       LS      PERF_CTL[5:0]
- * 0x030       LS      PERF_CTL[5:0]
- * 0x040       DC      PERF_CTL[5:0]
- * 0x050       DC      PERF_CTL[5:0]
- * 0x060       CU      PERF_CTL[2:0]
- * 0x070       CU      PERF_CTL[2:0]
- * 0x080       IC/DE   PERF_CTL[2:0]
- * 0x090       IC/DE   PERF_CTL[2:0]
- * 0x0A0       ---
- * 0x0B0       ---
- * 0x0C0       EX/LS   PERF_CTL[5:0]
- * 0x0D0       DE      PERF_CTL[2:0]
- * 0x0E0       NB      NB_PERF_CTL[3:0]
- * 0x0F0       NB      NB_PERF_CTL[3:0]
- *
- * Exceptions:
- *
- * 0x000       FP      PERF_CTL[3], PERF_CTL[5:3] (*)
- * 0x003       FP      PERF_CTL[3]
- * 0x004       FP      PERF_CTL[3], PERF_CTL[5:3] (*)
- * 0x00B       FP      PERF_CTL[3]
- * 0x00D       FP      PERF_CTL[3]
- * 0x023       DE      PERF_CTL[2:0]
- * 0x02D       LS      PERF_CTL[3]
- * 0x02E       LS      PERF_CTL[3,0]
- * 0x031       LS      PERF_CTL[2:0] (**)
- * 0x043       CU      PERF_CTL[2:0]
- * 0x045       CU      PERF_CTL[2:0]
- * 0x046       CU      PERF_CTL[2:0]
- * 0x054       CU      PERF_CTL[2:0]
- * 0x055       CU      PERF_CTL[2:0]
- * 0x08F       IC      PERF_CTL[0]
- * 0x187       DE      PERF_CTL[0]
- * 0x188       DE      PERF_CTL[0]
- * 0x0DB       EX      PERF_CTL[5:0]
- * 0x0DC       LS      PERF_CTL[5:0]
- * 0x0DD       LS      PERF_CTL[5:0]
- * 0x0DE       LS      PERF_CTL[5:0]
- * 0x0DF       LS      PERF_CTL[5:0]
- * 0x1C0       EX      PERF_CTL[5:3]
- * 0x1D6       EX      PERF_CTL[5:0]
- * 0x1D8       EX      PERF_CTL[5:0]
- *
- * (*)  depending on the umask all FPU counters may be used
- * (**) only one unitmask enabled at a time
- */
-
-static struct event_constraint amd_f15_PMC0  = EVENT_CONSTRAINT(0, 0x01, 0);
-static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
-static struct event_constraint amd_f15_PMC3  = EVENT_CONSTRAINT(0, 0x08, 0);
-static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
-static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
-static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
-
-static struct event_constraint *
-amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
-                              struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       unsigned int event_code = amd_get_event_code(hwc);
-
-       switch (event_code & AMD_EVENT_TYPE_MASK) {
-       case AMD_EVENT_FP:
-               switch (event_code) {
-               case 0x000:
-                       if (!(hwc->config & 0x0000F000ULL))
-                               break;
-                       if (!(hwc->config & 0x00000F00ULL))
-                               break;
-                       return &amd_f15_PMC3;
-               case 0x004:
-                       if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
-                               break;
-                       return &amd_f15_PMC3;
-               case 0x003:
-               case 0x00B:
-               case 0x00D:
-                       return &amd_f15_PMC3;
-               }
-               return &amd_f15_PMC53;
-       case AMD_EVENT_LS:
-       case AMD_EVENT_DC:
-       case AMD_EVENT_EX_LS:
-               switch (event_code) {
-               case 0x023:
-               case 0x043:
-               case 0x045:
-               case 0x046:
-               case 0x054:
-               case 0x055:
-                       return &amd_f15_PMC20;
-               case 0x02D:
-                       return &amd_f15_PMC3;
-               case 0x02E:
-                       return &amd_f15_PMC30;
-               case 0x031:
-                       if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
-                               return &amd_f15_PMC20;
-                       return &emptyconstraint;
-               case 0x1C0:
-                       return &amd_f15_PMC53;
-               default:
-                       return &amd_f15_PMC50;
-               }
-       case AMD_EVENT_CU:
-       case AMD_EVENT_IC_DE:
-       case AMD_EVENT_DE:
-               switch (event_code) {
-               case 0x08F:
-               case 0x187:
-               case 0x188:
-                       return &amd_f15_PMC0;
-               case 0x0DB ... 0x0DF:
-               case 0x1D6:
-               case 0x1D8:
-                       return &amd_f15_PMC50;
-               default:
-                       return &amd_f15_PMC20;
-               }
-       case AMD_EVENT_NB:
-               /* moved to perf_event_amd_uncore.c */
-               return &emptyconstraint;
-       default:
-               return &emptyconstraint;
-       }
-}
-
-static ssize_t amd_event_sysfs_show(char *page, u64 config)
-{
-       u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
-                   (config & AMD64_EVENTSEL_EVENT) >> 24;
-
-       return x86_event_sysfs_show(page, config, event);
-}
-
-static __initconst const struct x86_pmu amd_pmu = {
-       .name                   = "AMD",
-       .handle_irq             = x86_pmu_handle_irq,
-       .disable_all            = x86_pmu_disable_all,
-       .enable_all             = x86_pmu_enable_all,
-       .enable                 = x86_pmu_enable_event,
-       .disable                = x86_pmu_disable_event,
-       .hw_config              = amd_pmu_hw_config,
-       .schedule_events        = x86_schedule_events,
-       .eventsel               = MSR_K7_EVNTSEL0,
-       .perfctr                = MSR_K7_PERFCTR0,
-       .addr_offset            = amd_pmu_addr_offset,
-       .event_map              = amd_pmu_event_map,
-       .max_events             = ARRAY_SIZE(amd_perfmon_event_map),
-       .num_counters           = AMD64_NUM_COUNTERS,
-       .cntval_bits            = 48,
-       .cntval_mask            = (1ULL << 48) - 1,
-       .apic                   = 1,
-       /* use highest bit to detect overflow */
-       .max_period             = (1ULL << 47) - 1,
-       .get_event_constraints  = amd_get_event_constraints,
-       .put_event_constraints  = amd_put_event_constraints,
-
-       .format_attrs           = amd_format_attr,
-       .events_sysfs_show      = amd_event_sysfs_show,
-
-       .cpu_prepare            = amd_pmu_cpu_prepare,
-       .cpu_starting           = amd_pmu_cpu_starting,
-       .cpu_dead               = amd_pmu_cpu_dead,
-};
-
-static int __init amd_core_pmu_init(void)
-{
-       if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
-               return 0;
-
-       switch (boot_cpu_data.x86) {
-       case 0x15:
-               pr_cont("Fam15h ");
-               x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
-               break;
-
-       default:
-               pr_err("core perfctr but no constraints; unknown hardware!\n");
-               return -ENODEV;
-       }
-
-       /*
-        * If core performance counter extensions exists, we must use
-        * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
-        * amd_pmu_addr_offset().
-        */
-       x86_pmu.eventsel        = MSR_F15H_PERF_CTL;
-       x86_pmu.perfctr         = MSR_F15H_PERF_CTR;
-       x86_pmu.num_counters    = AMD64_NUM_COUNTERS_CORE;
-
-       pr_cont("core perfctr, ");
-       return 0;
-}
-
-__init int amd_pmu_init(void)
-{
-       int ret;
-
-       /* Performance-monitoring supported from K7 and later: */
-       if (boot_cpu_data.x86 < 6)
-               return -ENODEV;
-
-       x86_pmu = amd_pmu;
-
-       ret = amd_core_pmu_init();
-       if (ret)
-               return ret;
-
-       /* Events are common for all AMDs */
-       memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
-              sizeof(hw_cache_event_ids));
-
-       return 0;
-}
-
-void amd_pmu_enable_virt(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       cpuc->perf_ctr_virt_mask = 0;
-
-       /* Reload all events */
-       x86_pmu_disable_all();
-       x86_pmu_enable_all(0);
-}
-EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
-
-void amd_pmu_disable_virt(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       /*
-        * We only mask out the Host-only bit so that host-only counting works
-        * when SVM is disabled. If someone sets up a guest-only counter when
-        * SVM is disabled the Guest-only bits still gets set and the counter
-        * will not count anything.
-        */
-       cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
-
-       /* Reload all events */
-       x86_pmu_disable_all();
-       x86_pmu_enable_all(0);
-}
-EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c

deleted file mode 100644 (file)

index 989d3c2..0000000
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ /dev/null
@@ -1,959 +0,0 @@
-/*
- * Performance events - AMD IBS
- *
- *  Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
- *
- *  For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/ptrace.h>
-#include <linux/syscore_ops.h>
-
-#include <asm/apic.h>
-
-#include "perf_event.h"
-
-static u32 ibs_caps;
-
-#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
-
-#include <linux/kprobes.h>
-#include <linux/hardirq.h>
-
-#include <asm/nmi.h>
-
-#define IBS_FETCH_CONFIG_MASK  (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
-#define IBS_OP_CONFIG_MASK     IBS_OP_MAX_CNT
-
-enum ibs_states {
-       IBS_ENABLED     = 0,
-       IBS_STARTED     = 1,
-       IBS_STOPPING    = 2,
-
-       IBS_MAX_STATES,
-};
-
-struct cpu_perf_ibs {
-       struct perf_event       *event;
-       unsigned long           state[BITS_TO_LONGS(IBS_MAX_STATES)];
-};
-
-struct perf_ibs {
-       struct pmu                      pmu;
-       unsigned int                    msr;
-       u64                             config_mask;
-       u64                             cnt_mask;
-       u64                             enable_mask;
-       u64                             valid_mask;
-       u64                             max_period;
-       unsigned long                   offset_mask[1];
-       int                             offset_max;
-       struct cpu_perf_ibs __percpu    *pcpu;
-
-       struct attribute                **format_attrs;
-       struct attribute_group          format_group;
-       const struct attribute_group    *attr_groups[2];
-
-       u64                             (*get_count)(u64 config);
-};
-
-struct perf_ibs_data {
-       u32             size;
-       union {
-               u32     data[0];        /* data buffer starts here */
-               u32     caps;
-       };
-       u64             regs[MSR_AMD64_IBS_REG_COUNT_MAX];
-};
-
-static int
-perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
-{
-       s64 left = local64_read(&hwc->period_left);
-       s64 period = hwc->sample_period;
-       int overflow = 0;
-
-       /*
-        * If we are way outside a reasonable range then just skip forward:
-        */
-       if (unlikely(left <= -period)) {
-               left = period;
-               local64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-               overflow = 1;
-       }
-
-       if (unlikely(left < (s64)min)) {
-               left += period;
-               local64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-               overflow = 1;
-       }
-
-       /*
-        * If the hw period that triggers the sw overflow is too short
-        * we might hit the irq handler. This biases the results.
-        * Thus we shorten the next-to-last period and set the last
-        * period to the max period.
-        */
-       if (left > max) {
-               left -= max;
-               if (left > max)
-                       left = max;
-               else if (left < min)
-                       left = min;
-       }
-
-       *hw_period = (u64)left;
-
-       return overflow;
-}
-
-static  int
-perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       int shift = 64 - width;
-       u64 prev_raw_count;
-       u64 delta;
-
-       /*
-        * Careful: an NMI might modify the previous event value.
-        *
-        * Our tactic to handle this is to first atomically read and
-        * exchange a new raw count - then add that new-prev delta
-        * count to the generic event atomically:
-        */
-       prev_raw_count = local64_read(&hwc->prev_count);
-       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                                       new_raw_count) != prev_raw_count)
-               return 0;
-
-       /*
-        * Now we have the new raw value and have updated the prev
-        * timestamp already. We can now calculate the elapsed delta
-        * (event-)time and add that to the generic event.
-        *
-        * Careful, not all hw sign-extends above the physical width
-        * of the count.
-        */
-       delta = (new_raw_count << shift) - (prev_raw_count << shift);
-       delta >>= shift;
-
-       local64_add(delta, &event->count);
-       local64_sub(delta, &hwc->period_left);
-
-       return 1;
-}
-
-static struct perf_ibs perf_ibs_fetch;
-static struct perf_ibs perf_ibs_op;
-
-static struct perf_ibs *get_ibs_pmu(int type)
-{
-       if (perf_ibs_fetch.pmu.type == type)
-               return &perf_ibs_fetch;
-       if (perf_ibs_op.pmu.type == type)
-               return &perf_ibs_op;
-       return NULL;
-}
-
-/*
- * Use IBS for precise event sampling:
- *
- *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
- *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
- *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
- *
- * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
- * MSRC001_1033) is used to select either cycle or micro-ops counting
- * mode.
- *
- * The rip of IBS samples has skid 0. Thus, IBS supports precise
- * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
- * rip is invalid when IBS was not able to record the rip correctly.
- * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
- *
- */
-static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
-{
-       switch (event->attr.precise_ip) {
-       case 0:
-               return -ENOENT;
-       case 1:
-       case 2:
-               break;
-       default:
-               return -EOPNOTSUPP;
-       }
-
-       switch (event->attr.type) {
-       case PERF_TYPE_HARDWARE:
-               switch (event->attr.config) {
-               case PERF_COUNT_HW_CPU_CYCLES:
-                       *config = 0;
-                       return 0;
-               }
-               break;
-       case PERF_TYPE_RAW:
-               switch (event->attr.config) {
-               case 0x0076:
-                       *config = 0;
-                       return 0;
-               case 0x00C1:
-                       *config = IBS_OP_CNT_CTL;
-                       return 0;
-               }
-               break;
-       default:
-               return -ENOENT;
-       }
-
-       return -EOPNOTSUPP;
-}
-
-static const struct perf_event_attr ibs_notsupp = {
-       .exclude_user   = 1,
-       .exclude_kernel = 1,
-       .exclude_hv     = 1,
-       .exclude_idle   = 1,
-       .exclude_host   = 1,
-       .exclude_guest  = 1,
-};
-
-static int perf_ibs_init(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct perf_ibs *perf_ibs;
-       u64 max_cnt, config;
-       int ret;
-
-       perf_ibs = get_ibs_pmu(event->attr.type);
-       if (perf_ibs) {
-               config = event->attr.config;
-       } else {
-               perf_ibs = &perf_ibs_op;
-               ret = perf_ibs_precise_event(event, &config);
-               if (ret)
-                       return ret;
-       }
-
-       if (event->pmu != &perf_ibs->pmu)
-               return -ENOENT;
-
-       if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
-               return -EINVAL;
-
-       if (config & ~perf_ibs->config_mask)
-               return -EINVAL;
-
-       if (hwc->sample_period) {
-               if (config & perf_ibs->cnt_mask)
-                       /* raw max_cnt may not be set */
-                       return -EINVAL;
-               if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
-                       /*
-                        * lower 4 bits can not be set in ibs max cnt,
-                        * but allowing it in case we adjust the
-                        * sample period to set a frequency.
-                        */
-                       return -EINVAL;
-               hwc->sample_period &= ~0x0FULL;
-               if (!hwc->sample_period)
-                       hwc->sample_period = 0x10;
-       } else {
-               max_cnt = config & perf_ibs->cnt_mask;
-               config &= ~perf_ibs->cnt_mask;
-               event->attr.sample_period = max_cnt << 4;
-               hwc->sample_period = event->attr.sample_period;
-       }
-
-       if (!hwc->sample_period)
-               return -EINVAL;
-
-       /*
-        * If we modify hwc->sample_period, we also need to update
-        * hwc->last_period and hwc->period_left.
-        */
-       hwc->last_period = hwc->sample_period;
-       local64_set(&hwc->period_left, hwc->sample_period);
-
-       hwc->config_base = perf_ibs->msr;
-       hwc->config = config;
-
-       return 0;
-}
-
-static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
-                              struct hw_perf_event *hwc, u64 *period)
-{
-       int overflow;
-
-       /* ignore lower 4 bits in min count: */
-       overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
-       local64_set(&hwc->prev_count, 0);
-
-       return overflow;
-}
-
-static u64 get_ibs_fetch_count(u64 config)
-{
-       return (config & IBS_FETCH_CNT) >> 12;
-}
-
-static u64 get_ibs_op_count(u64 config)
-{
-       u64 count = 0;
-
-       if (config & IBS_OP_VAL)
-               count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
-
-       if (ibs_caps & IBS_CAPS_RDWROPCNT)
-               count += (config & IBS_OP_CUR_CNT) >> 32;
-
-       return count;
-}
-
-static void
-perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
-                     u64 *config)
-{
-       u64 count = perf_ibs->get_count(*config);
-
-       /*
-        * Set width to 64 since we do not overflow on max width but
-        * instead on max count. In perf_ibs_set_period() we clear
-        * prev count manually on overflow.
-        */
-       while (!perf_event_try_update(event, count, 64)) {
-               rdmsrl(event->hw.config_base, *config);
-               count = perf_ibs->get_count(*config);
-       }
-}
-
-static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
-                                        struct hw_perf_event *hwc, u64 config)
-{
-       wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
-}
-
-/*
- * Erratum #420 Instruction-Based Sampling Engine May Generate
- * Interrupt that Cannot Be Cleared:
- *
- * Must clear counter mask first, then clear the enable bit. See
- * Revision Guide for AMD Family 10h Processors, Publication #41322.
- */
-static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
-                                         struct hw_perf_event *hwc, u64 config)
-{
-       config &= ~perf_ibs->cnt_mask;
-       wrmsrl(hwc->config_base, config);
-       config &= ~perf_ibs->enable_mask;
-       wrmsrl(hwc->config_base, config);
-}
-
-/*
- * We cannot restore the ibs pmu state, so we always needs to update
- * the event while stopping it and then reset the state when starting
- * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in
- * perf_ibs_start()/perf_ibs_stop() and instead always do it.
- */
-static void perf_ibs_start(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
-       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-       u64 period;
-
-       if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
-               return;
-
-       WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
-       hwc->state = 0;
-
-       perf_ibs_set_period(perf_ibs, hwc, &period);
-       set_bit(IBS_STARTED, pcpu->state);
-       perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
-
-       perf_event_update_userpage(event);
-}
-
-static void perf_ibs_stop(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
-       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-       u64 config;
-       int stopping;
-
-       stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
-
-       if (!stopping && (hwc->state & PERF_HES_UPTODATE))
-               return;
-
-       rdmsrl(hwc->config_base, config);
-
-       if (stopping) {
-               set_bit(IBS_STOPPING, pcpu->state);
-               perf_ibs_disable_event(perf_ibs, hwc, config);
-               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-               hwc->state |= PERF_HES_STOPPED;
-       }
-
-       if (hwc->state & PERF_HES_UPTODATE)
-               return;
-
-       /*
-        * Clear valid bit to not count rollovers on update, rollovers
-        * are only updated in the irq handler.
-        */
-       config &= ~perf_ibs->valid_mask;
-
-       perf_ibs_event_update(perf_ibs, event, &config);
-       hwc->state |= PERF_HES_UPTODATE;
-}
-
-static int perf_ibs_add(struct perf_event *event, int flags)
-{
-       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
-       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-
-       if (test_and_set_bit(IBS_ENABLED, pcpu->state))
-               return -ENOSPC;
-
-       event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
-       pcpu->event = event;
-
-       if (flags & PERF_EF_START)
-               perf_ibs_start(event, PERF_EF_RELOAD);
-
-       return 0;
-}
-
-static void perf_ibs_del(struct perf_event *event, int flags)
-{
-       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
-       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-
-       if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
-               return;
-
-       perf_ibs_stop(event, PERF_EF_UPDATE);
-
-       pcpu->event = NULL;
-
-       perf_event_update_userpage(event);
-}
-
-static void perf_ibs_read(struct perf_event *event) { }
-
-PMU_FORMAT_ATTR(rand_en,       "config:57");
-PMU_FORMAT_ATTR(cnt_ctl,       "config:19");
-
-static struct attribute *ibs_fetch_format_attrs[] = {
-       &format_attr_rand_en.attr,
-       NULL,
-};
-
-static struct attribute *ibs_op_format_attrs[] = {
-       NULL,   /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
-       NULL,
-};
-
-static struct perf_ibs perf_ibs_fetch = {
-       .pmu = {
-               .task_ctx_nr    = perf_invalid_context,
-
-               .event_init     = perf_ibs_init,
-               .add            = perf_ibs_add,
-               .del            = perf_ibs_del,
-               .start          = perf_ibs_start,
-               .stop           = perf_ibs_stop,
-               .read           = perf_ibs_read,
-       },
-       .msr                    = MSR_AMD64_IBSFETCHCTL,
-       .config_mask            = IBS_FETCH_CONFIG_MASK,
-       .cnt_mask               = IBS_FETCH_MAX_CNT,
-       .enable_mask            = IBS_FETCH_ENABLE,
-       .valid_mask             = IBS_FETCH_VAL,
-       .max_period             = IBS_FETCH_MAX_CNT << 4,
-       .offset_mask            = { MSR_AMD64_IBSFETCH_REG_MASK },
-       .offset_max             = MSR_AMD64_IBSFETCH_REG_COUNT,
-       .format_attrs           = ibs_fetch_format_attrs,
-
-       .get_count              = get_ibs_fetch_count,
-};
-
-static struct perf_ibs perf_ibs_op = {
-       .pmu = {
-               .task_ctx_nr    = perf_invalid_context,
-
-               .event_init     = perf_ibs_init,
-               .add            = perf_ibs_add,
-               .del            = perf_ibs_del,
-               .start          = perf_ibs_start,
-               .stop           = perf_ibs_stop,
-               .read           = perf_ibs_read,
-       },
-       .msr                    = MSR_AMD64_IBSOPCTL,
-       .config_mask            = IBS_OP_CONFIG_MASK,
-       .cnt_mask               = IBS_OP_MAX_CNT,
-       .enable_mask            = IBS_OP_ENABLE,
-       .valid_mask             = IBS_OP_VAL,
-       .max_period             = IBS_OP_MAX_CNT << 4,
-       .offset_mask            = { MSR_AMD64_IBSOP_REG_MASK },
-       .offset_max             = MSR_AMD64_IBSOP_REG_COUNT,
-       .format_attrs           = ibs_op_format_attrs,
-
-       .get_count              = get_ibs_op_count,
-};
-
-static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
-{
-       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-       struct perf_event *event = pcpu->event;
-       struct hw_perf_event *hwc = &event->hw;
-       struct perf_sample_data data;
-       struct perf_raw_record raw;
-       struct pt_regs regs;
-       struct perf_ibs_data ibs_data;
-       int offset, size, check_rip, offset_max, throttle = 0;
-       unsigned int msr;
-       u64 *buf, *config, period;
-
-       if (!test_bit(IBS_STARTED, pcpu->state)) {
-               /*
-                * Catch spurious interrupts after stopping IBS: After
-                * disabling IBS there could be still incoming NMIs
-                * with samples that even have the valid bit cleared.
-                * Mark all this NMIs as handled.
-                */
-               return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
-       }
-
-       msr = hwc->config_base;
-       buf = ibs_data.regs;
-       rdmsrl(msr, *buf);
-       if (!(*buf++ & perf_ibs->valid_mask))
-               return 0;
-
-       config = &ibs_data.regs[0];
-       perf_ibs_event_update(perf_ibs, event, config);
-       perf_sample_data_init(&data, 0, hwc->last_period);
-       if (!perf_ibs_set_period(perf_ibs, hwc, &period))
-               goto out;       /* no sw counter overflow */
-
-       ibs_data.caps = ibs_caps;
-       size = 1;
-       offset = 1;
-       check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
-       if (event->attr.sample_type & PERF_SAMPLE_RAW)
-               offset_max = perf_ibs->offset_max;
-       else if (check_rip)
-               offset_max = 2;
-       else
-               offset_max = 1;
-       do {
-               rdmsrl(msr + offset, *buf++);
-               size++;
-               offset = find_next_bit(perf_ibs->offset_mask,
-                                      perf_ibs->offset_max,
-                                      offset + 1);
-       } while (offset < offset_max);
-       if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-               /*
-                * Read IbsBrTarget and IbsOpData4 separately
-                * depending on their availability.
-                * Can't add to offset_max as they are staggered
-                */
-               if (ibs_caps & IBS_CAPS_BRNTRGT) {
-                       rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
-                       size++;
-               }
-               if (ibs_caps & IBS_CAPS_OPDATA4) {
-                       rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
-                       size++;
-               }
-       }
-       ibs_data.size = sizeof(u64) * size;
-
-       regs = *iregs;
-       if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
-               regs.flags &= ~PERF_EFLAGS_EXACT;
-       } else {
-               set_linear_ip(&regs, ibs_data.regs[1]);
-               regs.flags |= PERF_EFLAGS_EXACT;
-       }
-
-       if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-               raw.size = sizeof(u32) + ibs_data.size;
-               raw.data = ibs_data.data;
-               data.raw = &raw;
-       }
-
-       throttle = perf_event_overflow(event, &data, &regs);
-out:
-       if (throttle)
-               perf_ibs_disable_event(perf_ibs, hwc, *config);
-       else
-               perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
-
-       perf_event_update_userpage(event);
-
-       return 1;
-}
-
-static int
-perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
-{
-       int handled = 0;
-
-       handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
-       handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
-
-       if (handled)
-               inc_irq_stat(apic_perf_irqs);
-
-       return handled;
-}
-NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
-
-static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
-{
-       struct cpu_perf_ibs __percpu *pcpu;
-       int ret;
-
-       pcpu = alloc_percpu(struct cpu_perf_ibs);
-       if (!pcpu)
-               return -ENOMEM;
-
-       perf_ibs->pcpu = pcpu;
-
-       /* register attributes */
-       if (perf_ibs->format_attrs[0]) {
-               memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
-               perf_ibs->format_group.name     = "format";
-               perf_ibs->format_group.attrs    = perf_ibs->format_attrs;
-
-               memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
-               perf_ibs->attr_groups[0]        = &perf_ibs->format_group;
-               perf_ibs->pmu.attr_groups       = perf_ibs->attr_groups;
-       }
-
-       ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
-       if (ret) {
-               perf_ibs->pcpu = NULL;
-               free_percpu(pcpu);
-       }
-
-       return ret;
-}
-
-static __init int perf_event_ibs_init(void)
-{
-       struct attribute **attr = ibs_op_format_attrs;
-
-       if (!ibs_caps)
-               return -ENODEV; /* ibs not supported by the cpu */
-
-       perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
-
-       if (ibs_caps & IBS_CAPS_OPCNT) {
-               perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
-               *attr++ = &format_attr_cnt_ctl.attr;
-       }
-       perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
-
-       register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
-       printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
-
-       return 0;
-}
-
-#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
-
-static __init int perf_event_ibs_init(void) { return 0; }
-
-#endif
-
-/* IBS - apic initialization, for perf and oprofile */
-
-static __init u32 __get_ibs_caps(void)
-{
-       u32 caps;
-       unsigned int max_level;
-
-       if (!boot_cpu_has(X86_FEATURE_IBS))
-               return 0;
-
-       /* check IBS cpuid feature flags */
-       max_level = cpuid_eax(0x80000000);
-       if (max_level < IBS_CPUID_FEATURES)
-               return IBS_CAPS_DEFAULT;
-
-       caps = cpuid_eax(IBS_CPUID_FEATURES);
-       if (!(caps & IBS_CAPS_AVAIL))
-               /* cpuid flags not valid */
-               return IBS_CAPS_DEFAULT;
-
-       return caps;
-}
-
-u32 get_ibs_caps(void)
-{
-       return ibs_caps;
-}
-
-EXPORT_SYMBOL(get_ibs_caps);
-
-static inline int get_eilvt(int offset)
-{
-       return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
-}
-
-static inline int put_eilvt(int offset)
-{
-       return !setup_APIC_eilvt(offset, 0, 0, 1);
-}
-
-/*
- * Check and reserve APIC extended interrupt LVT offset for IBS if available.
- */
-static inline int ibs_eilvt_valid(void)
-{
-       int offset;
-       u64 val;
-       int valid = 0;
-
-       preempt_disable();
-
-       rdmsrl(MSR_AMD64_IBSCTL, val);
-       offset = val & IBSCTL_LVT_OFFSET_MASK;
-
-       if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
-               pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
-                      smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
-               goto out;
-       }
-
-       if (!get_eilvt(offset)) {
-               pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
-                      smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
-               goto out;
-       }
-
-       valid = 1;
-out:
-       preempt_enable();
-
-       return valid;
-}
-
-static int setup_ibs_ctl(int ibs_eilvt_off)
-{
-       struct pci_dev *cpu_cfg;
-       int nodes;
-       u32 value = 0;
-
-       nodes = 0;
-       cpu_cfg = NULL;
-       do {
-               cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
-                                        PCI_DEVICE_ID_AMD_10H_NB_MISC,
-                                        cpu_cfg);
-               if (!cpu_cfg)
-                       break;
-               ++nodes;
-               pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
-                                      | IBSCTL_LVT_OFFSET_VALID);
-               pci_read_config_dword(cpu_cfg, IBSCTL, &value);
-               if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
-                       pci_dev_put(cpu_cfg);
-                       printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
-                              "IBSCTL = 0x%08x\n", value);
-                       return -EINVAL;
-               }
-       } while (1);
-
-       if (!nodes) {
-               printk(KERN_DEBUG "No CPU node configured for IBS\n");
-               return -ENODEV;
-       }
-
-       return 0;
-}
-
-/*
- * This runs only on the current cpu. We try to find an LVT offset and
- * setup the local APIC. For this we must disable preemption. On
- * success we initialize all nodes with this offset. This updates then
- * the offset in the IBS_CTL per-node msr. The per-core APIC setup of
- * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
- * is using the new offset.
- */
-static void force_ibs_eilvt_setup(void)
-{
-       int offset;
-       int ret;
-
-       preempt_disable();
-       /* find the next free available EILVT entry, skip offset 0 */
-       for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
-               if (get_eilvt(offset))
-                       break;
-       }
-       preempt_enable();
-
-       if (offset == APIC_EILVT_NR_MAX) {
-               printk(KERN_DEBUG "No EILVT entry available\n");
-               return;
-       }
-
-       ret = setup_ibs_ctl(offset);
-       if (ret)
-               goto out;
-
-       if (!ibs_eilvt_valid())
-               goto out;
-
-       pr_info("IBS: LVT offset %d assigned\n", offset);
-
-       return;
-out:
-       preempt_disable();
-       put_eilvt(offset);
-       preempt_enable();
-       return;
-}
-
-static void ibs_eilvt_setup(void)
-{
-       /*
-        * Force LVT offset assignment for family 10h: The offsets are
-        * not assigned by the BIOS for this family, so the OS is
-        * responsible for doing it. If the OS assignment fails, fall
-        * back to BIOS settings and try to setup this.
-        */
-       if (boot_cpu_data.x86 == 0x10)
-               force_ibs_eilvt_setup();
-}
-
-static inline int get_ibs_lvt_offset(void)
-{
-       u64 val;
-
-       rdmsrl(MSR_AMD64_IBSCTL, val);
-       if (!(val & IBSCTL_LVT_OFFSET_VALID))
-               return -EINVAL;
-
-       return val & IBSCTL_LVT_OFFSET_MASK;
-}
-
-static void setup_APIC_ibs(void *dummy)
-{
-       int offset;
-
-       offset = get_ibs_lvt_offset();
-       if (offset < 0)
-               goto failed;
-
-       if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
-               return;
-failed:
-       pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
-               smp_processor_id());
-}
-
-static void clear_APIC_ibs(void *dummy)
-{
-       int offset;
-
-       offset = get_ibs_lvt_offset();
-       if (offset >= 0)
-               setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
-}
-
-#ifdef CONFIG_PM
-
-static int perf_ibs_suspend(void)
-{
-       clear_APIC_ibs(NULL);
-       return 0;
-}
-
-static void perf_ibs_resume(void)
-{
-       ibs_eilvt_setup();
-       setup_APIC_ibs(NULL);
-}
-
-static struct syscore_ops perf_ibs_syscore_ops = {
-       .resume         = perf_ibs_resume,
-       .suspend        = perf_ibs_suspend,
-};
-
-static void perf_ibs_pm_init(void)
-{
-       register_syscore_ops(&perf_ibs_syscore_ops);
-}
-
-#else
-
-static inline void perf_ibs_pm_init(void) { }
-
-#endif
-
-static int
-perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
-{
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_STARTING:
-               setup_APIC_ibs(NULL);
-               break;
-       case CPU_DYING:
-               clear_APIC_ibs(NULL);
-               break;
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-static __init int amd_ibs_init(void)
-{
-       u32 caps;
-       int ret = -EINVAL;
-
-       caps = __get_ibs_caps();
-       if (!caps)
-               return -ENODEV; /* ibs not supported by the cpu */
-
-       ibs_eilvt_setup();
-
-       if (!ibs_eilvt_valid())
-               goto out;
-
-       perf_ibs_pm_init();
-       cpu_notifier_register_begin();
-       ibs_caps = caps;
-       /* make ibs_caps visible to other cpus: */
-       smp_mb();
-       smp_call_function(setup_APIC_ibs, NULL, 1);
-       __perf_cpu_notifier(perf_ibs_cpu_notifier);
-       cpu_notifier_register_done();
-
-       ret = perf_event_ibs_init();
-out:
-       if (ret)
-               pr_err("Failed to setup IBS, %d\n", ret);
-       return ret;
-}
-
-/* Since we need the pci subsystem to init ibs we can't do this earlier: */
-device_initcall(amd_ibs_init);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c

deleted file mode 100644 (file)

index 97242a9..0000000
--- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c
+++ /dev/null
@@ -1,499 +0,0 @@
-/*
- * Copyright (C) 2013 Advanced Micro Devices, Inc.
- *
- * Author: Steven Kinney <Steven.Kinney@amd.com>
- * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
- *
- * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/perf_event.h>
-#include <linux/module.h>
-#include <linux/cpumask.h>
-#include <linux/slab.h>
-
-#include "perf_event.h"
-#include "perf_event_amd_iommu.h"
-
-#define COUNTER_SHIFT          16
-
-#define _GET_BANK(ev)       ((u8)(ev->hw.extra_reg.reg >> 8))
-#define _GET_CNTR(ev)       ((u8)(ev->hw.extra_reg.reg))
-
-/* iommu pmu config masks */
-#define _GET_CSOURCE(ev)    ((ev->hw.config & 0xFFULL))
-#define _GET_DEVID(ev)      ((ev->hw.config >> 8)  & 0xFFFFULL)
-#define _GET_PASID(ev)      ((ev->hw.config >> 24) & 0xFFFFULL)
-#define _GET_DOMID(ev)      ((ev->hw.config >> 40) & 0xFFFFULL)
-#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config)  & 0xFFFFULL)
-#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL)
-#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL)
-
-static struct perf_amd_iommu __perf_iommu;
-
-struct perf_amd_iommu {
-       struct pmu pmu;
-       u8 max_banks;
-       u8 max_counters;
-       u64 cntr_assign_mask;
-       raw_spinlock_t lock;
-       const struct attribute_group *attr_groups[4];
-};
-
-#define format_group   attr_groups[0]
-#define cpumask_group  attr_groups[1]
-#define events_group   attr_groups[2]
-#define null_group     attr_groups[3]
-
-/*---------------------------------------------
- * sysfs format attributes
- *---------------------------------------------*/
-PMU_FORMAT_ATTR(csource,    "config:0-7");
-PMU_FORMAT_ATTR(devid,      "config:8-23");
-PMU_FORMAT_ATTR(pasid,      "config:24-39");
-PMU_FORMAT_ATTR(domid,      "config:40-55");
-PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
-PMU_FORMAT_ATTR(pasid_mask, "config1:16-31");
-PMU_FORMAT_ATTR(domid_mask, "config1:32-47");
-
-static struct attribute *iommu_format_attrs[] = {
-       &format_attr_csource.attr,
-       &format_attr_devid.attr,
-       &format_attr_pasid.attr,
-       &format_attr_domid.attr,
-       &format_attr_devid_mask.attr,
-       &format_attr_pasid_mask.attr,
-       &format_attr_domid_mask.attr,
-       NULL,
-};
-
-static struct attribute_group amd_iommu_format_group = {
-       .name = "format",
-       .attrs = iommu_format_attrs,
-};
-
-/*---------------------------------------------
- * sysfs events attributes
- *---------------------------------------------*/
-struct amd_iommu_event_desc {
-       struct kobj_attribute attr;
-       const char *event;
-};
-
-static ssize_t _iommu_event_show(struct kobject *kobj,
-                               struct kobj_attribute *attr, char *buf)
-{
-       struct amd_iommu_event_desc *event =
-               container_of(attr, struct amd_iommu_event_desc, attr);
-       return sprintf(buf, "%s\n", event->event);
-}
-
-#define AMD_IOMMU_EVENT_DESC(_name, _event)                    \
-{                                                              \
-       .attr  = __ATTR(_name, 0444, _iommu_event_show, NULL),  \
-       .event = _event,                                        \
-}
-
-static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
-       AMD_IOMMU_EVENT_DESC(mem_pass_untrans,        "csource=0x01"),
-       AMD_IOMMU_EVENT_DESC(mem_pass_pretrans,       "csource=0x02"),
-       AMD_IOMMU_EVENT_DESC(mem_pass_excl,           "csource=0x03"),
-       AMD_IOMMU_EVENT_DESC(mem_target_abort,        "csource=0x04"),
-       AMD_IOMMU_EVENT_DESC(mem_trans_total,         "csource=0x05"),
-       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit,   "csource=0x06"),
-       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis,   "csource=0x07"),
-       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit,   "csource=0x08"),
-       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis,   "csource=0x09"),
-       AMD_IOMMU_EVENT_DESC(mem_dte_hit,             "csource=0x0a"),
-       AMD_IOMMU_EVENT_DESC(mem_dte_mis,             "csource=0x0b"),
-       AMD_IOMMU_EVENT_DESC(page_tbl_read_tot,       "csource=0x0c"),
-       AMD_IOMMU_EVENT_DESC(page_tbl_read_nst,       "csource=0x0d"),
-       AMD_IOMMU_EVENT_DESC(page_tbl_read_gst,       "csource=0x0e"),
-       AMD_IOMMU_EVENT_DESC(int_dte_hit,             "csource=0x0f"),
-       AMD_IOMMU_EVENT_DESC(int_dte_mis,             "csource=0x10"),
-       AMD_IOMMU_EVENT_DESC(cmd_processed,           "csource=0x11"),
-       AMD_IOMMU_EVENT_DESC(cmd_processed_inv,       "csource=0x12"),
-       AMD_IOMMU_EVENT_DESC(tlb_inv,                 "csource=0x13"),
-       { /* end: all zeroes */ },
-};
-
-/*---------------------------------------------
- * sysfs cpumask attributes
- *---------------------------------------------*/
-static cpumask_t iommu_cpumask;
-
-static ssize_t _iommu_cpumask_show(struct device *dev,
-                                  struct device_attribute *attr,
-                                  char *buf)
-{
-       return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask);
-}
-static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL);
-
-static struct attribute *iommu_cpumask_attrs[] = {
-       &dev_attr_cpumask.attr,
-       NULL,
-};
-
-static struct attribute_group amd_iommu_cpumask_group = {
-       .attrs = iommu_cpumask_attrs,
-};
-
-/*---------------------------------------------*/
-
-static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu)
-{
-       unsigned long flags;
-       int shift, bank, cntr, retval;
-       int max_banks = perf_iommu->max_banks;
-       int max_cntrs = perf_iommu->max_counters;
-
-       raw_spin_lock_irqsave(&perf_iommu->lock, flags);
-
-       for (bank = 0, shift = 0; bank < max_banks; bank++) {
-               for (cntr = 0; cntr < max_cntrs; cntr++) {
-                       shift = bank + (bank*3) + cntr;
-                       if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) {
-                               continue;
-                       } else {
-                               perf_iommu->cntr_assign_mask |= (1ULL<<shift);
-                               retval = ((u16)((u16)bank<<8) | (u8)(cntr));
-                               goto out;
-                       }
-               }
-       }
-       retval = -ENOSPC;
-out:
-       raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
-       return retval;
-}
-
-static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
-                                       u8 bank, u8 cntr)
-{
-       unsigned long flags;
-       int max_banks, max_cntrs;
-       int shift = 0;
-
-       max_banks = perf_iommu->max_banks;
-       max_cntrs = perf_iommu->max_counters;
-
-       if ((bank > max_banks) || (cntr > max_cntrs))
-               return -EINVAL;
-
-       shift = bank + cntr + (bank*3);
-
-       raw_spin_lock_irqsave(&perf_iommu->lock, flags);
-       perf_iommu->cntr_assign_mask &= ~(1ULL<<shift);
-       raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
-
-       return 0;
-}
-
-static int perf_iommu_event_init(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct perf_amd_iommu *perf_iommu;
-       u64 config, config1;
-
-       /* test the event attr type check for PMU enumeration */
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       /*
-        * IOMMU counters are shared across all cores.
-        * Therefore, it does not support per-process mode.
-        * Also, it does not support event sampling mode.
-        */
-       if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
-               return -EINVAL;
-
-       /* IOMMU counters do not have usr/os/guest/host bits */
-       if (event->attr.exclude_user || event->attr.exclude_kernel ||
-           event->attr.exclude_host || event->attr.exclude_guest)
-               return -EINVAL;
-
-       if (event->cpu < 0)
-               return -EINVAL;
-
-       perf_iommu = &__perf_iommu;
-
-       if (event->pmu != &perf_iommu->pmu)
-               return -ENOENT;
-
-       if (perf_iommu) {
-               config = event->attr.config;
-               config1 = event->attr.config1;
-       } else {
-               return -EINVAL;
-       }
-
-       /* integrate with iommu base devid (0000), assume one iommu */
-       perf_iommu->max_banks =
-               amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID);
-       perf_iommu->max_counters =
-               amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID);
-       if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0))
-               return -EINVAL;
-
-       /* update the hw_perf_event struct with the iommu config data */
-       hwc->config = config;
-       hwc->extra_reg.config = config1;
-
-       return 0;
-}
-
-static void perf_iommu_enable_event(struct perf_event *ev)
-{
-       u8 csource = _GET_CSOURCE(ev);
-       u16 devid = _GET_DEVID(ev);
-       u64 reg = 0ULL;
-
-       reg = csource;
-       amd_iommu_pc_get_set_reg_val(devid,
-                       _GET_BANK(ev), _GET_CNTR(ev) ,
-                        IOMMU_PC_COUNTER_SRC_REG, &reg, true);
-
-       reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
-       if (reg)
-               reg |= (1UL << 31);
-       amd_iommu_pc_get_set_reg_val(devid,
-                       _GET_BANK(ev), _GET_CNTR(ev) ,
-                        IOMMU_PC_DEVID_MATCH_REG, &reg, true);
-
-       reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
-       if (reg)
-               reg |= (1UL << 31);
-       amd_iommu_pc_get_set_reg_val(devid,
-                       _GET_BANK(ev), _GET_CNTR(ev) ,
-                        IOMMU_PC_PASID_MATCH_REG, &reg, true);
-
-       reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
-       if (reg)
-               reg |= (1UL << 31);
-       amd_iommu_pc_get_set_reg_val(devid,
-                       _GET_BANK(ev), _GET_CNTR(ev) ,
-                        IOMMU_PC_DOMID_MATCH_REG, &reg, true);
-}
-
-static void perf_iommu_disable_event(struct perf_event *event)
-{
-       u64 reg = 0ULL;
-
-       amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
-                       _GET_BANK(event), _GET_CNTR(event),
-                       IOMMU_PC_COUNTER_SRC_REG, &reg, true);
-}
-
-static void perf_iommu_start(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       pr_debug("perf: amd_iommu:perf_iommu_start\n");
-       if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
-               return;
-
-       WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
-       hwc->state = 0;
-
-       if (flags & PERF_EF_RELOAD) {
-               u64 prev_raw_count =  local64_read(&hwc->prev_count);
-               amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
-                               _GET_BANK(event), _GET_CNTR(event),
-                               IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
-       }
-
-       perf_iommu_enable_event(event);
-       perf_event_update_userpage(event);
-
-}
-
-static void perf_iommu_read(struct perf_event *event)
-{
-       u64 count = 0ULL;
-       u64 prev_raw_count = 0ULL;
-       u64 delta = 0ULL;
-       struct hw_perf_event *hwc = &event->hw;
-       pr_debug("perf: amd_iommu:perf_iommu_read\n");
-
-       amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
-                               _GET_BANK(event), _GET_CNTR(event),
-                               IOMMU_PC_COUNTER_REG, &count, false);
-
-       /* IOMMU pc counter register is only 48 bits */
-       count &= 0xFFFFFFFFFFFFULL;
-
-       prev_raw_count =  local64_read(&hwc->prev_count);
-       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                                       count) != prev_raw_count)
-               return;
-
-       /* Handling 48-bit counter overflowing */
-       delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
-       delta >>= COUNTER_SHIFT;
-       local64_add(delta, &event->count);
-
-}
-
-static void perf_iommu_stop(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 config;
-
-       pr_debug("perf: amd_iommu:perf_iommu_stop\n");
-
-       if (hwc->state & PERF_HES_UPTODATE)
-               return;
-
-       perf_iommu_disable_event(event);
-       WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-       hwc->state |= PERF_HES_STOPPED;
-
-       if (hwc->state & PERF_HES_UPTODATE)
-               return;
-
-       config = hwc->config;
-       perf_iommu_read(event);
-       hwc->state |= PERF_HES_UPTODATE;
-}
-
-static int perf_iommu_add(struct perf_event *event, int flags)
-{
-       int retval;
-       struct perf_amd_iommu *perf_iommu =
-                       container_of(event->pmu, struct perf_amd_iommu, pmu);
-
-       pr_debug("perf: amd_iommu:perf_iommu_add\n");
-       event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
-       /* request an iommu bank/counter */
-       retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
-       if (retval != -ENOSPC)
-               event->hw.extra_reg.reg = (u16)retval;
-       else
-               return retval;
-
-       if (flags & PERF_EF_START)
-               perf_iommu_start(event, PERF_EF_RELOAD);
-
-       return 0;
-}
-
-static void perf_iommu_del(struct perf_event *event, int flags)
-{
-       struct perf_amd_iommu *perf_iommu =
-                       container_of(event->pmu, struct perf_amd_iommu, pmu);
-
-       pr_debug("perf: amd_iommu:perf_iommu_del\n");
-       perf_iommu_stop(event, PERF_EF_UPDATE);
-
-       /* clear the assigned iommu bank/counter */
-       clear_avail_iommu_bnk_cntr(perf_iommu,
-                                    _GET_BANK(event),
-                                    _GET_CNTR(event));
-
-       perf_event_update_userpage(event);
-}
-
-static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
-{
-       struct attribute **attrs;
-       struct attribute_group *attr_group;
-       int i = 0, j;
-
-       while (amd_iommu_v2_event_descs[i].attr.attr.name)
-               i++;
-
-       attr_group = kzalloc(sizeof(struct attribute *)
-               * (i + 1) + sizeof(*attr_group), GFP_KERNEL);
-       if (!attr_group)
-               return -ENOMEM;
-
-       attrs = (struct attribute **)(attr_group + 1);
-       for (j = 0; j < i; j++)
-               attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
-
-       attr_group->name = "events";
-       attr_group->attrs = attrs;
-       perf_iommu->events_group = attr_group;
-
-       return 0;
-}
-
-static __init void amd_iommu_pc_exit(void)
-{
-       if (__perf_iommu.events_group != NULL) {
-               kfree(__perf_iommu.events_group);
-               __perf_iommu.events_group = NULL;
-       }
-}
-
-static __init int _init_perf_amd_iommu(
-       struct perf_amd_iommu *perf_iommu, char *name)
-{
-       int ret;
-
-       raw_spin_lock_init(&perf_iommu->lock);
-
-       /* Init format attributes */
-       perf_iommu->format_group = &amd_iommu_format_group;
-
-       /* Init cpumask attributes to only core 0 */
-       cpumask_set_cpu(0, &iommu_cpumask);
-       perf_iommu->cpumask_group = &amd_iommu_cpumask_group;
-
-       /* Init events attributes */
-       if (_init_events_attrs(perf_iommu) != 0)
-               pr_err("perf: amd_iommu: Only support raw events.\n");
-
-       /* Init null attributes */
-       perf_iommu->null_group = NULL;
-       perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
-
-       ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
-       if (ret) {
-               pr_err("perf: amd_iommu: Failed to initialized.\n");
-               amd_iommu_pc_exit();
-       } else {
-               pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n",
-                       amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID),
-                       amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID));
-       }
-
-       return ret;
-}
-
-static struct perf_amd_iommu __perf_iommu = {
-       .pmu = {
-               .event_init     = perf_iommu_event_init,
-               .add            = perf_iommu_add,
-               .del            = perf_iommu_del,
-               .start          = perf_iommu_start,
-               .stop           = perf_iommu_stop,
-               .read           = perf_iommu_read,
-       },
-       .max_banks              = 0x00,
-       .max_counters           = 0x00,
-       .cntr_assign_mask       = 0ULL,
-       .format_group           = NULL,
-       .cpumask_group          = NULL,
-       .events_group           = NULL,
-       .null_group             = NULL,
-};
-
-static __init int amd_iommu_pc_init(void)
-{
-       /* Make sure the IOMMU PC resource is available */
-       if (!amd_iommu_pc_supported())
-               return -ENODEV;
-
-       _init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
-
-       return 0;
-}
-
-device_initcall(amd_iommu_pc_init);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.h b/arch/x86/kernel/cpu/perf_event_amd_iommu.h

deleted file mode 100644 (file)

index 845d173..0000000
--- a/arch/x86/kernel/cpu/perf_event_amd_iommu.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (C) 2013 Advanced Micro Devices, Inc.
- *
- * Author: Steven Kinney <Steven.Kinney@amd.com>
- * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef _PERF_EVENT_AMD_IOMMU_H_
-#define _PERF_EVENT_AMD_IOMMU_H_
-
-/* iommu pc mmio region register indexes */
-#define IOMMU_PC_COUNTER_REG                   0x00
-#define IOMMU_PC_COUNTER_SRC_REG               0x08
-#define IOMMU_PC_PASID_MATCH_REG               0x10
-#define IOMMU_PC_DOMID_MATCH_REG               0x18
-#define IOMMU_PC_DEVID_MATCH_REG               0x20
-#define IOMMU_PC_COUNTER_REPORT_REG            0x28
-
-/* maximun specified bank/counters */
-#define PC_MAX_SPEC_BNKS                       64
-#define PC_MAX_SPEC_CNTRS                      16
-
-/* iommu pc reg masks*/
-#define IOMMU_BASE_DEVID                       0x0000
-
-/* amd_iommu_init.c external support functions */
-extern bool amd_iommu_pc_supported(void);
-
-extern u8 amd_iommu_pc_get_max_banks(u16 devid);
-
-extern u8 amd_iommu_pc_get_max_counters(u16 devid);
-
-extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
-                       u8 fxn, u64 *value, bool is_write);
-
-#endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c

deleted file mode 100644 (file)

index 8836fc9..0000000
--- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c
+++ /dev/null
@@ -1,603 +0,0 @@
-/*
- * Copyright (C) 2013 Advanced Micro Devices, Inc.
- *
- * Author: Jacob Shin <jacob.shin@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/perf_event.h>
-#include <linux/percpu.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/cpu.h>
-#include <linux/cpumask.h>
-
-#include <asm/cpufeature.h>
-#include <asm/perf_event.h>
-#include <asm/msr.h>
-
-#define NUM_COUNTERS_NB                4
-#define NUM_COUNTERS_L2                4
-#define MAX_COUNTERS           NUM_COUNTERS_NB
-
-#define RDPMC_BASE_NB          6
-#define RDPMC_BASE_L2          10
-
-#define COUNTER_SHIFT          16
-
-struct amd_uncore {
-       int id;
-       int refcnt;
-       int cpu;
-       int num_counters;
-       int rdpmc_base;
-       u32 msr_base;
-       cpumask_t *active_mask;
-       struct pmu *pmu;
-       struct perf_event *events[MAX_COUNTERS];
-       struct amd_uncore *free_when_cpu_online;
-};
-
-static struct amd_uncore * __percpu *amd_uncore_nb;
-static struct amd_uncore * __percpu *amd_uncore_l2;
-
-static struct pmu amd_nb_pmu;
-static struct pmu amd_l2_pmu;
-
-static cpumask_t amd_nb_active_mask;
-static cpumask_t amd_l2_active_mask;
-
-static bool is_nb_event(struct perf_event *event)
-{
-       return event->pmu->type == amd_nb_pmu.type;
-}
-
-static bool is_l2_event(struct perf_event *event)
-{
-       return event->pmu->type == amd_l2_pmu.type;
-}
-
-static struct amd_uncore *event_to_amd_uncore(struct perf_event *event)
-{
-       if (is_nb_event(event) && amd_uncore_nb)
-               return *per_cpu_ptr(amd_uncore_nb, event->cpu);
-       else if (is_l2_event(event) && amd_uncore_l2)
-               return *per_cpu_ptr(amd_uncore_l2, event->cpu);
-
-       return NULL;
-}
-
-static void amd_uncore_read(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 prev, new;
-       s64 delta;
-
-       /*
-        * since we do not enable counter overflow interrupts,
-        * we do not have to worry about prev_count changing on us
-        */
-
-       prev = local64_read(&hwc->prev_count);
-       rdpmcl(hwc->event_base_rdpmc, new);
-       local64_set(&hwc->prev_count, new);
-       delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
-       delta >>= COUNTER_SHIFT;
-       local64_add(delta, &event->count);
-}
-
-static void amd_uncore_start(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (flags & PERF_EF_RELOAD)
-               wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count));
-
-       hwc->state = 0;
-       wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE));
-       perf_event_update_userpage(event);
-}
-
-static void amd_uncore_stop(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       wrmsrl(hwc->config_base, hwc->config);
-       hwc->state |= PERF_HES_STOPPED;
-
-       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-               amd_uncore_read(event);
-               hwc->state |= PERF_HES_UPTODATE;
-       }
-}
-
-static int amd_uncore_add(struct perf_event *event, int flags)
-{
-       int i;
-       struct amd_uncore *uncore = event_to_amd_uncore(event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       /* are we already assigned? */
-       if (hwc->idx != -1 && uncore->events[hwc->idx] == event)
-               goto out;
-
-       for (i = 0; i < uncore->num_counters; i++) {
-               if (uncore->events[i] == event) {
-                       hwc->idx = i;
-                       goto out;
-               }
-       }
-
-       /* if not, take the first available counter */
-       hwc->idx = -1;
-       for (i = 0; i < uncore->num_counters; i++) {
-               if (cmpxchg(&uncore->events[i], NULL, event) == NULL) {
-                       hwc->idx = i;
-                       break;
-               }
-       }
-
-out:
-       if (hwc->idx == -1)
-               return -EBUSY;
-
-       hwc->config_base = uncore->msr_base + (2 * hwc->idx);
-       hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx);
-       hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx;
-       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
-       if (flags & PERF_EF_START)
-               amd_uncore_start(event, PERF_EF_RELOAD);
-
-       return 0;
-}
-
-static void amd_uncore_del(struct perf_event *event, int flags)
-{
-       int i;
-       struct amd_uncore *uncore = event_to_amd_uncore(event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       amd_uncore_stop(event, PERF_EF_UPDATE);
-
-       for (i = 0; i < uncore->num_counters; i++) {
-               if (cmpxchg(&uncore->events[i], event, NULL) == event)
-                       break;
-       }
-
-       hwc->idx = -1;
-}
-
-static int amd_uncore_event_init(struct perf_event *event)
-{
-       struct amd_uncore *uncore;
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       /*
-        * NB and L2 counters (MSRs) are shared across all cores that share the
-        * same NB / L2 cache. Interrupts can be directed to a single target
-        * core, however, event counts generated by processes running on other
-        * cores cannot be masked out. So we do not support sampling and
-        * per-thread events.
-        */
-       if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
-               return -EINVAL;
-
-       /* NB and L2 counters do not have usr/os/guest/host bits */
-       if (event->attr.exclude_user || event->attr.exclude_kernel ||
-           event->attr.exclude_host || event->attr.exclude_guest)
-               return -EINVAL;
-
-       /* and we do not enable counter overflow interrupts */
-       hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
-       hwc->idx = -1;
-
-       if (event->cpu < 0)
-               return -EINVAL;
-
-       uncore = event_to_amd_uncore(event);
-       if (!uncore)
-               return -ENODEV;
-
-       /*
-        * since request can come in to any of the shared cores, we will remap
-        * to a single common cpu.
-        */
-       event->cpu = uncore->cpu;
-
-       return 0;
-}
-
-static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
-                                           struct device_attribute *attr,
-                                           char *buf)
-{
-       cpumask_t *active_mask;
-       struct pmu *pmu = dev_get_drvdata(dev);
-
-       if (pmu->type == amd_nb_pmu.type)
-               active_mask = &amd_nb_active_mask;
-       else if (pmu->type == amd_l2_pmu.type)
-               active_mask = &amd_l2_active_mask;
-       else
-               return 0;
-
-       return cpumap_print_to_pagebuf(true, buf, active_mask);
-}
-static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
-
-static struct attribute *amd_uncore_attrs[] = {
-       &dev_attr_cpumask.attr,
-       NULL,
-};
-
-static struct attribute_group amd_uncore_attr_group = {
-       .attrs = amd_uncore_attrs,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-7,32-35");
-PMU_FORMAT_ATTR(umask, "config:8-15");
-
-static struct attribute *amd_uncore_format_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       NULL,
-};
-
-static struct attribute_group amd_uncore_format_group = {
-       .name = "format",
-       .attrs = amd_uncore_format_attr,
-};
-
-static const struct attribute_group *amd_uncore_attr_groups[] = {
-       &amd_uncore_attr_group,
-       &amd_uncore_format_group,
-       NULL,
-};
-
-static struct pmu amd_nb_pmu = {
-       .attr_groups    = amd_uncore_attr_groups,
-       .name           = "amd_nb",
-       .event_init     = amd_uncore_event_init,
-       .add            = amd_uncore_add,
-       .del            = amd_uncore_del,
-       .start          = amd_uncore_start,
-       .stop           = amd_uncore_stop,
-       .read           = amd_uncore_read,
-};
-
-static struct pmu amd_l2_pmu = {
-       .attr_groups    = amd_uncore_attr_groups,
-       .name           = "amd_l2",
-       .event_init     = amd_uncore_event_init,
-       .add            = amd_uncore_add,
-       .del            = amd_uncore_del,
-       .start          = amd_uncore_start,
-       .stop           = amd_uncore_stop,
-       .read           = amd_uncore_read,
-};
-
-static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
-{
-       return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL,
-                       cpu_to_node(cpu));
-}
-
-static int amd_uncore_cpu_up_prepare(unsigned int cpu)
-{
-       struct amd_uncore *uncore_nb = NULL, *uncore_l2;
-
-       if (amd_uncore_nb) {
-               uncore_nb = amd_uncore_alloc(cpu);
-               if (!uncore_nb)
-                       goto fail;
-               uncore_nb->cpu = cpu;
-               uncore_nb->num_counters = NUM_COUNTERS_NB;
-               uncore_nb->rdpmc_base = RDPMC_BASE_NB;
-               uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
-               uncore_nb->active_mask = &amd_nb_active_mask;
-               uncore_nb->pmu = &amd_nb_pmu;
-               *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
-       }
-
-       if (amd_uncore_l2) {
-               uncore_l2 = amd_uncore_alloc(cpu);
-               if (!uncore_l2)
-                       goto fail;
-               uncore_l2->cpu = cpu;
-               uncore_l2->num_counters = NUM_COUNTERS_L2;
-               uncore_l2->rdpmc_base = RDPMC_BASE_L2;
-               uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
-               uncore_l2->active_mask = &amd_l2_active_mask;
-               uncore_l2->pmu = &amd_l2_pmu;
-               *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
-       }
-
-       return 0;
-
-fail:
-       if (amd_uncore_nb)
-               *per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
-       kfree(uncore_nb);
-       return -ENOMEM;
-}
-
-static struct amd_uncore *
-amd_uncore_find_online_sibling(struct amd_uncore *this,
-                              struct amd_uncore * __percpu *uncores)
-{
-       unsigned int cpu;
-       struct amd_uncore *that;
-
-       for_each_online_cpu(cpu) {
-               that = *per_cpu_ptr(uncores, cpu);
-
-               if (!that)
-                       continue;
-
-               if (this == that)
-                       continue;
-
-               if (this->id == that->id) {
-                       that->free_when_cpu_online = this;
-                       this = that;
-                       break;
-               }
-       }
-
-       this->refcnt++;
-       return this;
-}
-
-static void amd_uncore_cpu_starting(unsigned int cpu)
-{
-       unsigned int eax, ebx, ecx, edx;
-       struct amd_uncore *uncore;
-
-       if (amd_uncore_nb) {
-               uncore = *per_cpu_ptr(amd_uncore_nb, cpu);
-               cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
-               uncore->id = ecx & 0xff;
-
-               uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb);
-               *per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
-       }
-
-       if (amd_uncore_l2) {
-               unsigned int apicid = cpu_data(cpu).apicid;
-               unsigned int nshared;
-
-               uncore = *per_cpu_ptr(amd_uncore_l2, cpu);
-               cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
-               nshared = ((eax >> 14) & 0xfff) + 1;
-               uncore->id = apicid - (apicid % nshared);
-
-               uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2);
-               *per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
-       }
-}
-
-static void uncore_online(unsigned int cpu,
-                         struct amd_uncore * __percpu *uncores)
-{
-       struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
-
-       kfree(uncore->free_when_cpu_online);
-       uncore->free_when_cpu_online = NULL;
-
-       if (cpu == uncore->cpu)
-               cpumask_set_cpu(cpu, uncore->active_mask);
-}
-
-static void amd_uncore_cpu_online(unsigned int cpu)
-{
-       if (amd_uncore_nb)
-               uncore_online(cpu, amd_uncore_nb);
-
-       if (amd_uncore_l2)
-               uncore_online(cpu, amd_uncore_l2);
-}
-
-static void uncore_down_prepare(unsigned int cpu,
-                               struct amd_uncore * __percpu *uncores)
-{
-       unsigned int i;
-       struct amd_uncore *this = *per_cpu_ptr(uncores, cpu);
-
-       if (this->cpu != cpu)
-               return;
-
-       /* this cpu is going down, migrate to a shared sibling if possible */
-       for_each_online_cpu(i) {
-               struct amd_uncore *that = *per_cpu_ptr(uncores, i);
-
-               if (cpu == i)
-                       continue;
-
-               if (this == that) {
-                       perf_pmu_migrate_context(this->pmu, cpu, i);
-                       cpumask_clear_cpu(cpu, that->active_mask);
-                       cpumask_set_cpu(i, that->active_mask);
-                       that->cpu = i;
-                       break;
-               }
-       }
-}
-
-static void amd_uncore_cpu_down_prepare(unsigned int cpu)
-{
-       if (amd_uncore_nb)
-               uncore_down_prepare(cpu, amd_uncore_nb);
-
-       if (amd_uncore_l2)
-               uncore_down_prepare(cpu, amd_uncore_l2);
-}
-
-static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
-{
-       struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
-
-       if (cpu == uncore->cpu)
-               cpumask_clear_cpu(cpu, uncore->active_mask);
-
-       if (!--uncore->refcnt)
-               kfree(uncore);
-       *per_cpu_ptr(uncores, cpu) = NULL;
-}
-
-static void amd_uncore_cpu_dead(unsigned int cpu)
-{
-       if (amd_uncore_nb)
-               uncore_dead(cpu, amd_uncore_nb);
-
-       if (amd_uncore_l2)
-               uncore_dead(cpu, amd_uncore_l2);
-}
-
-static int
-amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action,
-                       void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-               if (amd_uncore_cpu_up_prepare(cpu))
-                       return notifier_from_errno(-ENOMEM);
-               break;
-
-       case CPU_STARTING:
-               amd_uncore_cpu_starting(cpu);
-               break;
-
-       case CPU_ONLINE:
-               amd_uncore_cpu_online(cpu);
-               break;
-
-       case CPU_DOWN_PREPARE:
-               amd_uncore_cpu_down_prepare(cpu);
-               break;
-
-       case CPU_UP_CANCELED:
-       case CPU_DEAD:
-               amd_uncore_cpu_dead(cpu);
-               break;
-
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-static struct notifier_block amd_uncore_cpu_notifier_block = {
-       .notifier_call  = amd_uncore_cpu_notifier,
-       .priority       = CPU_PRI_PERF + 1,
-};
-
-static void __init init_cpu_already_online(void *dummy)
-{
-       unsigned int cpu = smp_processor_id();
-
-       amd_uncore_cpu_starting(cpu);
-       amd_uncore_cpu_online(cpu);
-}
-
-static void cleanup_cpu_online(void *dummy)
-{
-       unsigned int cpu = smp_processor_id();
-
-       amd_uncore_cpu_dead(cpu);
-}
-
-static int __init amd_uncore_init(void)
-{
-       unsigned int cpu, cpu2;
-       int ret = -ENODEV;
-
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
-               goto fail_nodev;
-
-       if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
-               goto fail_nodev;
-
-       if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
-               amd_uncore_nb = alloc_percpu(struct amd_uncore *);
-               if (!amd_uncore_nb) {
-                       ret = -ENOMEM;
-                       goto fail_nb;
-               }
-               ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
-               if (ret)
-                       goto fail_nb;
-
-               printk(KERN_INFO "perf: AMD NB counters detected\n");
-               ret = 0;
-       }
-
-       if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) {
-               amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
-               if (!amd_uncore_l2) {
-                       ret = -ENOMEM;
-                       goto fail_l2;
-               }
-               ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
-               if (ret)
-                       goto fail_l2;
-
-               printk(KERN_INFO "perf: AMD L2I counters detected\n");
-               ret = 0;
-       }
-
-       if (ret)
-               goto fail_nodev;
-
-       cpu_notifier_register_begin();
-
-       /* init cpus already online before registering for hotplug notifier */
-       for_each_online_cpu(cpu) {
-               ret = amd_uncore_cpu_up_prepare(cpu);
-               if (ret)
-                       goto fail_online;
-               smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
-       }
-
-       __register_cpu_notifier(&amd_uncore_cpu_notifier_block);
-       cpu_notifier_register_done();
-
-       return 0;
-
-
-fail_online:
-       for_each_online_cpu(cpu2) {
-               if (cpu2 == cpu)
-                       break;
-               smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1);
-       }
-       cpu_notifier_register_done();
-
-       /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */
-       amd_uncore_nb = amd_uncore_l2 = NULL;
-
-       if (boot_cpu_has(X86_FEATURE_PERFCTR_L2))
-               perf_pmu_unregister(&amd_l2_pmu);
-fail_l2:
-       if (boot_cpu_has(X86_FEATURE_PERFCTR_NB))
-               perf_pmu_unregister(&amd_nb_pmu);
-       if (amd_uncore_l2)
-               free_percpu(amd_uncore_l2);
-fail_nb:
-       if (amd_uncore_nb)
-               free_percpu(amd_uncore_nb);
-
-fail_nodev:
-       return ret;
-}
-device_initcall(amd_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c

deleted file mode 100644 (file)

index fed2ab1..0000000
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ /dev/null
@@ -1,3773 +0,0 @@
-/*
- * Per core/cpu state
- *
- * Used to coordinate shared registers between HT threads or
- * among events on a single PMU.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/stddef.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include <linux/nmi.h>
-
-#include <asm/cpufeature.h>
-#include <asm/hardirq.h>
-#include <asm/apic.h>
-
-#include "perf_event.h"
-
-/*
- * Intel PerfMon, used on Core and later.
- */
-static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
-{
-       [PERF_COUNT_HW_CPU_CYCLES]              = 0x003c,
-       [PERF_COUNT_HW_INSTRUCTIONS]            = 0x00c0,
-       [PERF_COUNT_HW_CACHE_REFERENCES]        = 0x4f2e,
-       [PERF_COUNT_HW_CACHE_MISSES]            = 0x412e,
-       [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]     = 0x00c4,
-       [PERF_COUNT_HW_BRANCH_MISSES]           = 0x00c5,
-       [PERF_COUNT_HW_BUS_CYCLES]              = 0x013c,
-       [PERF_COUNT_HW_REF_CPU_CYCLES]          = 0x0300, /* pseudo-encoding */
-};
-
-static struct event_constraint intel_core_event_constraints[] __read_mostly =
-{
-       INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
-       INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
-       INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
-       INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
-       INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
-       INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_core2_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
-       INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
-       INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
-       INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
-       INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
-       INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
-       INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
-       INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
-       INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
-       INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
-       INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
-       INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
-       INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
-       INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
-       INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
-       INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
-       INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
-       EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
-{
-       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
-       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
-       EVENT_EXTRA_END
-};
-
-static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
-       INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
-       INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
-       INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_snb_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
-       INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
-       INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
-       INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
-       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
-       INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
-
-       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
-
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */
-       INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
-       INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
-       INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
-
-       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
-
-       EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
-{
-       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
-       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
-       EVENT_EXTRA_END
-};
-
-static struct event_constraint intel_v1_event_constraints[] __read_mostly =
-{
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_gen_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_slm_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_skl_event_constraints[] = {
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0),      /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1),      /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2),      /* CPU_CLK_UNHALTED.REF */
-       INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2),    /* INST_RETIRED.PREC_DIST */
-       EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
-       INTEL_UEVENT_EXTRA_REG(0x01b7,
-                              MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x02b7,
-                              MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1),
-       EVENT_EXTRA_END
-};
-
-static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
-       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1),
-       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
-       EVENT_EXTRA_END
-};
-
-static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
-       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
-       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
-       EVENT_EXTRA_END
-};
-
-static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
-       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
-       /*
-        * Note the low 8 bits eventsel code is not a continuous field, containing
-        * some #GPing bits. These are masked out.
-        */
-       INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
-       EVENT_EXTRA_END
-};
-
-EVENT_ATTR_STR(mem-loads,      mem_ld_nhm,     "event=0x0b,umask=0x10,ldlat=3");
-EVENT_ATTR_STR(mem-loads,      mem_ld_snb,     "event=0xcd,umask=0x1,ldlat=3");
-EVENT_ATTR_STR(mem-stores,     mem_st_snb,     "event=0xcd,umask=0x2");
-
-struct attribute *nhm_events_attrs[] = {
-       EVENT_PTR(mem_ld_nhm),
-       NULL,
-};
-
-struct attribute *snb_events_attrs[] = {
-       EVENT_PTR(mem_ld_snb),
-       EVENT_PTR(mem_st_snb),
-       NULL,
-};
-
-static struct event_constraint intel_hsw_event_constraints[] = {
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_UEVENT_CONSTRAINT(0x148, 0x4),    /* L1D_PEND_MISS.PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
-       INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
-       /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),
-       /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
-       /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
-       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
-
-       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
-
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_bdw_event_constraints[] = {
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0),      /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1),      /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2),      /* CPU_CLK_UNHALTED.REF */
-       INTEL_UEVENT_CONSTRAINT(0x148, 0x4),    /* L1D_PEND_MISS.PENDING */
-       INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4),        /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
-       EVENT_CONSTRAINT_END
-};
-
-static u64 intel_pmu_event_map(int hw_event)
-{
-       return intel_perfmon_event_map[hw_event];
-}
-
-/*
- * Notes on the events:
- * - data reads do not include code reads (comparable to earlier tables)
- * - data counts include speculative execution (except L1 write, dtlb, bpu)
- * - remote node access includes remote memory, remote cache, remote mmio.
- * - prefetches are not included in the counts.
- * - icache miss does not include decoded icache
- */
-
-#define SKL_DEMAND_DATA_RD             BIT_ULL(0)
-#define SKL_DEMAND_RFO                 BIT_ULL(1)
-#define SKL_ANY_RESPONSE               BIT_ULL(16)
-#define SKL_SUPPLIER_NONE              BIT_ULL(17)
-#define SKL_L3_MISS_LOCAL_DRAM         BIT_ULL(26)
-#define SKL_L3_MISS_REMOTE_HOP0_DRAM   BIT_ULL(27)
-#define SKL_L3_MISS_REMOTE_HOP1_DRAM   BIT_ULL(28)
-#define SKL_L3_MISS_REMOTE_HOP2P_DRAM  BIT_ULL(29)
-#define SKL_L3_MISS                    (SKL_L3_MISS_LOCAL_DRAM| \
-                                        SKL_L3_MISS_REMOTE_HOP0_DRAM| \
-                                        SKL_L3_MISS_REMOTE_HOP1_DRAM| \
-                                        SKL_L3_MISS_REMOTE_HOP2P_DRAM)
-#define SKL_SPL_HIT                    BIT_ULL(30)
-#define SKL_SNOOP_NONE                 BIT_ULL(31)
-#define SKL_SNOOP_NOT_NEEDED           BIT_ULL(32)
-#define SKL_SNOOP_MISS                 BIT_ULL(33)
-#define SKL_SNOOP_HIT_NO_FWD           BIT_ULL(34)
-#define SKL_SNOOP_HIT_WITH_FWD         BIT_ULL(35)
-#define SKL_SNOOP_HITM                 BIT_ULL(36)
-#define SKL_SNOOP_NON_DRAM             BIT_ULL(37)
-#define SKL_ANY_SNOOP                  (SKL_SPL_HIT|SKL_SNOOP_NONE| \
-                                        SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
-                                        SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
-                                        SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
-#define SKL_DEMAND_READ                        SKL_DEMAND_DATA_RD
-#define SKL_SNOOP_DRAM                 (SKL_SNOOP_NONE| \
-                                        SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
-                                        SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
-                                        SKL_SNOOP_HITM|SKL_SPL_HIT)
-#define SKL_DEMAND_WRITE               SKL_DEMAND_RFO
-#define SKL_LLC_ACCESS                 SKL_ANY_RESPONSE
-#define SKL_L3_MISS_REMOTE             (SKL_L3_MISS_REMOTE_HOP0_DRAM| \
-                                        SKL_L3_MISS_REMOTE_HOP1_DRAM| \
-                                        SKL_L3_MISS_REMOTE_HOP2P_DRAM)
-
-static __initconst const u64 skl_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_INST_RETIRED.ALL_LOADS */
-               [ C(RESULT_MISS)   ] = 0x151,   /* L1D.REPLACEMENT */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_INST_RETIRED.ALL_STORES */
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x283,   /* ICACHE_64B.MISS */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_INST_RETIRED.ALL_LOADS */
-               [ C(RESULT_MISS)   ] = 0x608,   /* DTLB_LOAD_MISSES.WALK_COMPLETED */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_INST_RETIRED.ALL_STORES */
-               [ C(RESULT_MISS)   ] = 0x649,   /* DTLB_STORE_MISSES.WALK_COMPLETED */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2085,  /* ITLB_MISSES.STLB_HIT */
-               [ C(RESULT_MISS)   ] = 0xe85,   /* ITLB_MISSES.WALK_COMPLETED */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0xc4,    /* BR_INST_RETIRED.ALL_BRANCHES */
-               [ C(RESULT_MISS)   ] = 0xc5,    /* BR_MISP_RETIRED.ALL_BRANCHES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
-};
-
-static __initconst const u64 skl_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
-                                      SKL_LLC_ACCESS|SKL_ANY_SNOOP,
-               [ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
-                                      SKL_L3_MISS|SKL_ANY_SNOOP|
-                                      SKL_SUPPLIER_NONE,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
-                                      SKL_LLC_ACCESS|SKL_ANY_SNOOP,
-               [ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
-                                      SKL_L3_MISS|SKL_ANY_SNOOP|
-                                      SKL_SUPPLIER_NONE,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
-                                      SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
-               [ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
-                                      SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
-                                      SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
-               [ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
-                                      SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
-};
-
-#define SNB_DMND_DATA_RD       (1ULL << 0)
-#define SNB_DMND_RFO           (1ULL << 1)
-#define SNB_DMND_IFETCH                (1ULL << 2)
-#define SNB_DMND_WB            (1ULL << 3)
-#define SNB_PF_DATA_RD         (1ULL << 4)
-#define SNB_PF_RFO             (1ULL << 5)
-#define SNB_PF_IFETCH          (1ULL << 6)
-#define SNB_LLC_DATA_RD                (1ULL << 7)
-#define SNB_LLC_RFO            (1ULL << 8)
-#define SNB_LLC_IFETCH         (1ULL << 9)
-#define SNB_BUS_LOCKS          (1ULL << 10)
-#define SNB_STRM_ST            (1ULL << 11)
-#define SNB_OTHER              (1ULL << 15)
-#define SNB_RESP_ANY           (1ULL << 16)
-#define SNB_NO_SUPP            (1ULL << 17)
-#define SNB_LLC_HITM           (1ULL << 18)
-#define SNB_LLC_HITE           (1ULL << 19)
-#define SNB_LLC_HITS           (1ULL << 20)
-#define SNB_LLC_HITF           (1ULL << 21)
-#define SNB_LOCAL              (1ULL << 22)
-#define SNB_REMOTE             (0xffULL << 23)
-#define SNB_SNP_NONE           (1ULL << 31)
-#define SNB_SNP_NOT_NEEDED     (1ULL << 32)
-#define SNB_SNP_MISS           (1ULL << 33)
-#define SNB_NO_FWD             (1ULL << 34)
-#define SNB_SNP_FWD            (1ULL << 35)
-#define SNB_HITM               (1ULL << 36)
-#define SNB_NON_DRAM           (1ULL << 37)
-
-#define SNB_DMND_READ          (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD)
-#define SNB_DMND_WRITE         (SNB_DMND_RFO|SNB_LLC_RFO)
-#define SNB_DMND_PREFETCH      (SNB_PF_DATA_RD|SNB_PF_RFO)
-
-#define SNB_SNP_ANY            (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
-                                SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
-                                SNB_HITM)
-
-#define SNB_DRAM_ANY           (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
-#define SNB_DRAM_REMOTE                (SNB_REMOTE|SNB_SNP_ANY)
-
-#define SNB_L3_ACCESS          SNB_RESP_ANY
-#define SNB_L3_MISS            (SNB_DRAM_ANY|SNB_NON_DRAM)
-
-static __initconst const u64 snb_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = SNB_DMND_READ|SNB_L3_MISS,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = SNB_DMND_WRITE|SNB_L3_MISS,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = SNB_DMND_PREFETCH|SNB_L3_MISS,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY,
-               [ C(RESULT_MISS)   ] = SNB_DMND_READ|SNB_DRAM_REMOTE,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY,
-               [ C(RESULT_MISS)   ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY,
-               [ C(RESULT_MISS)   ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE,
-       },
- },
-};
-
-static __initconst const u64 snb_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS        */
-               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPLACEMENT              */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES       */
-               [ C(RESULT_MISS)   ] = 0x0851, /* L1D.ALL_M_REPLACEMENT        */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x024e, /* HW_PRE_REQ.DL1_MISS          */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0280, /* ICACHE.MISSES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_WRITE) ] = {
-               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
-               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
-               [ C(RESULT_MISS)   ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT         */
-               [ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
-               [ C(RESULT_MISS)   ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
-
-};
-
-/*
- * Notes on the events:
- * - data reads do not include code reads (comparable to earlier tables)
- * - data counts include speculative execution (except L1 write, dtlb, bpu)
- * - remote node access includes remote memory, remote cache, remote mmio.
- * - prefetches are not included in the counts because they are not
- *   reliably counted.
- */
-
-#define HSW_DEMAND_DATA_RD             BIT_ULL(0)
-#define HSW_DEMAND_RFO                 BIT_ULL(1)
-#define HSW_ANY_RESPONSE               BIT_ULL(16)
-#define HSW_SUPPLIER_NONE              BIT_ULL(17)
-#define HSW_L3_MISS_LOCAL_DRAM         BIT_ULL(22)
-#define HSW_L3_MISS_REMOTE_HOP0                BIT_ULL(27)
-#define HSW_L3_MISS_REMOTE_HOP1                BIT_ULL(28)
-#define HSW_L3_MISS_REMOTE_HOP2P       BIT_ULL(29)
-#define HSW_L3_MISS                    (HSW_L3_MISS_LOCAL_DRAM| \
-                                        HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
-                                        HSW_L3_MISS_REMOTE_HOP2P)
-#define HSW_SNOOP_NONE                 BIT_ULL(31)
-#define HSW_SNOOP_NOT_NEEDED           BIT_ULL(32)
-#define HSW_SNOOP_MISS                 BIT_ULL(33)
-#define HSW_SNOOP_HIT_NO_FWD           BIT_ULL(34)
-#define HSW_SNOOP_HIT_WITH_FWD         BIT_ULL(35)
-#define HSW_SNOOP_HITM                 BIT_ULL(36)
-#define HSW_SNOOP_NON_DRAM             BIT_ULL(37)
-#define HSW_ANY_SNOOP                  (HSW_SNOOP_NONE| \
-                                        HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
-                                        HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
-                                        HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
-#define HSW_SNOOP_DRAM                 (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
-#define HSW_DEMAND_READ                        HSW_DEMAND_DATA_RD
-#define HSW_DEMAND_WRITE               HSW_DEMAND_RFO
-#define HSW_L3_MISS_REMOTE             (HSW_L3_MISS_REMOTE_HOP0|\
-                                        HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
-#define HSW_LLC_ACCESS                 HSW_ANY_RESPONSE
-
-#define BDW_L3_MISS_LOCAL              BIT(26)
-#define BDW_L3_MISS                    (BDW_L3_MISS_LOCAL| \
-                                        HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
-                                        HSW_L3_MISS_REMOTE_HOP2P)
-
-
-static __initconst const u64 hsw_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_UOPS_RETIRED.ALL_LOADS */
-               [ C(RESULT_MISS)   ] = 0x151,   /* L1D.REPLACEMENT */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_UOPS_RETIRED.ALL_STORES */
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x280,   /* ICACHE.MISSES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_UOPS_RETIRED.ALL_LOADS */
-               [ C(RESULT_MISS)   ] = 0x108,   /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_UOPS_RETIRED.ALL_STORES */
-               [ C(RESULT_MISS)   ] = 0x149,   /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x6085,  /* ITLB_MISSES.STLB_HIT */
-               [ C(RESULT_MISS)   ] = 0x185,   /* ITLB_MISSES.MISS_CAUSES_A_WALK */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0xc4,    /* BR_INST_RETIRED.ALL_BRANCHES */
-               [ C(RESULT_MISS)   ] = 0xc5,    /* BR_MISP_RETIRED.ALL_BRANCHES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
-};
-
-static __initconst const u64 hsw_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
-                                      HSW_LLC_ACCESS,
-               [ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
-                                      HSW_L3_MISS|HSW_ANY_SNOOP,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
-                                      HSW_LLC_ACCESS,
-               [ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
-                                      HSW_L3_MISS|HSW_ANY_SNOOP,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
-                                      HSW_L3_MISS_LOCAL_DRAM|
-                                      HSW_SNOOP_DRAM,
-               [ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
-                                      HSW_L3_MISS_REMOTE|
-                                      HSW_SNOOP_DRAM,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
-                                      HSW_L3_MISS_LOCAL_DRAM|
-                                      HSW_SNOOP_DRAM,
-               [ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
-                                      HSW_L3_MISS_REMOTE|
-                                      HSW_SNOOP_DRAM,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
-};
-
-static __initconst const u64 westmere_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
-               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPL                     */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES      */
-               [ C(RESULT_MISS)   ] = 0x0251, /* L1D.M_REPL                   */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
-               [ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
-               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       /*
-        * Use RFO, not WRITEBACK, because a write miss would typically occur
-        * on RFO.
-        */
-       [ C(OP_WRITE) ] = {
-               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
-               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES      */
-               [ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
-               [ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISSES.ANY              */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
-               [ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
-};
-
-/*
- * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
- * See IA32 SDM Vol 3B 30.6.1.3
- */
-
-#define NHM_DMND_DATA_RD       (1 << 0)
-#define NHM_DMND_RFO           (1 << 1)
-#define NHM_DMND_IFETCH                (1 << 2)
-#define NHM_DMND_WB            (1 << 3)
-#define NHM_PF_DATA_RD         (1 << 4)
-#define NHM_PF_DATA_RFO                (1 << 5)
-#define NHM_PF_IFETCH          (1 << 6)
-#define NHM_OFFCORE_OTHER      (1 << 7)
-#define NHM_UNCORE_HIT         (1 << 8)
-#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
-#define NHM_OTHER_CORE_HITM    (1 << 10)
-                               /* reserved */
-#define NHM_REMOTE_CACHE_FWD   (1 << 12)
-#define NHM_REMOTE_DRAM                (1 << 13)
-#define NHM_LOCAL_DRAM         (1 << 14)
-#define NHM_NON_DRAM           (1 << 15)
-
-#define NHM_LOCAL              (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
-#define NHM_REMOTE             (NHM_REMOTE_DRAM)
-
-#define NHM_DMND_READ          (NHM_DMND_DATA_RD)
-#define NHM_DMND_WRITE         (NHM_DMND_RFO|NHM_DMND_WB)
-#define NHM_DMND_PREFETCH      (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
-
-#define NHM_L3_HIT     (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
-#define NHM_L3_MISS    (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
-#define NHM_L3_ACCESS  (NHM_L3_HIT|NHM_L3_MISS)
-
-static __initconst const u64 nehalem_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_L3_MISS,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_L3_MISS,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
-               [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
-               [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
-               [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE,
-       },
- },
-};
-
-static __initconst const u64 nehalem_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
-               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPL                     */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES      */
-               [ C(RESULT_MISS)   ] = 0x0251, /* L1D.M_REPL                   */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
-               [ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
-               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       /*
-        * Use RFO, not WRITEBACK, because a write miss would typically occur
-        * on RFO.
-        */
-       [ C(OP_WRITE) ] = {
-               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
-               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
-               [ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
-               [ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
-               [ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
-};
-
-static __initconst const u64 core2_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
-               [ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
-               [ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
-               [ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
-               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
-               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
-               [ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
-               [ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
-               [ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
-               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-static __initconst const u64 atom_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */
-               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
-               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
-               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */
-               [ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */
-               [ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
-               [ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
-               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-static struct extra_reg intel_slm_extra_regs[] __read_mostly =
-{
-       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),
-       EVENT_EXTRA_END
-};
-
-#define SLM_DMND_READ          SNB_DMND_DATA_RD
-#define SLM_DMND_WRITE         SNB_DMND_RFO
-#define SLM_DMND_PREFETCH      (SNB_PF_DATA_RD|SNB_PF_RFO)
-
-#define SLM_SNP_ANY            (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM)
-#define SLM_LLC_ACCESS         SNB_RESP_ANY
-#define SLM_LLC_MISS           (SLM_SNP_ANY|SNB_NON_DRAM)
-
-static __initconst const u64 slm_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS,
-               [ C(RESULT_MISS)   ] = SLM_DMND_WRITE|SLM_LLC_MISS,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS,
-               [ C(RESULT_MISS)   ] = SLM_DMND_PREFETCH|SLM_LLC_MISS,
-       },
- },
-};
-
-static __initconst const u64 slm_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0x0104, /* LD_DCU_MISS */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */
-               [ C(RESULT_MISS)   ] = 0x0280, /* ICACGE.MISSES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_WRITE) ] = {
-               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0x0804, /* LD_DTLB_MISS */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
-               [ C(RESULT_MISS)   ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
-               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-#define KNL_OT_L2_HITE         BIT_ULL(19) /* Other Tile L2 Hit */
-#define KNL_OT_L2_HITF         BIT_ULL(20) /* Other Tile L2 Hit */
-#define KNL_MCDRAM_LOCAL       BIT_ULL(21)
-#define KNL_MCDRAM_FAR         BIT_ULL(22)
-#define KNL_DDR_LOCAL          BIT_ULL(23)
-#define KNL_DDR_FAR            BIT_ULL(24)
-#define KNL_DRAM_ANY           (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \
-                                   KNL_DDR_LOCAL | KNL_DDR_FAR)
-#define KNL_L2_READ            SLM_DMND_READ
-#define KNL_L2_WRITE           SLM_DMND_WRITE
-#define KNL_L2_PREFETCH                SLM_DMND_PREFETCH
-#define KNL_L2_ACCESS          SLM_LLC_ACCESS
-#define KNL_L2_MISS            (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \
-                                  KNL_DRAM_ANY | SNB_SNP_ANY | \
-                                                 SNB_NON_DRAM)
-
-static __initconst const u64 knl_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-       [C(LL)] = {
-               [C(OP_READ)] = {
-                       [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS,
-                       [C(RESULT_MISS)]   = 0,
-               },
-               [C(OP_WRITE)] = {
-                       [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS,
-                       [C(RESULT_MISS)]   = KNL_L2_WRITE | KNL_L2_MISS,
-               },
-               [C(OP_PREFETCH)] = {
-                       [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS,
-                       [C(RESULT_MISS)]   = KNL_L2_PREFETCH | KNL_L2_MISS,
-               },
-       },
-};
-
-/*
- * Use from PMIs where the LBRs are already disabled.
- */
-static void __intel_pmu_disable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-
-       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
-               intel_pmu_disable_bts();
-       else
-               intel_bts_disable_local();
-
-       intel_pmu_pebs_disable_all();
-}
-
-static void intel_pmu_disable_all(void)
-{
-       __intel_pmu_disable_all();
-       intel_pmu_lbr_disable_all();
-}
-
-static void __intel_pmu_enable_all(int added, bool pmi)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       intel_pmu_pebs_enable_all();
-       intel_pmu_lbr_enable_all(pmi);
-       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
-                       x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
-
-       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
-               struct perf_event *event =
-                       cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
-
-               if (WARN_ON_ONCE(!event))
-                       return;
-
-               intel_pmu_enable_bts(event->hw.config);
-       } else
-               intel_bts_enable_local();
-}
-
-static void intel_pmu_enable_all(int added)
-{
-       __intel_pmu_enable_all(added, false);
-}
-
-/*
- * Workaround for:
- *   Intel Errata AAK100 (model 26)
- *   Intel Errata AAP53  (model 30)
- *   Intel Errata BD53   (model 44)
- *
- * The official story:
- *   These chips need to be 'reset' when adding counters by programming the
- *   magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
- *   in sequence on the same PMC or on different PMCs.
- *
- * In practise it appears some of these events do in fact count, and
- * we need to programm all 4 events.
- */
-static void intel_pmu_nhm_workaround(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       static const unsigned long nhm_magic[4] = {
-               0x4300B5,
-               0x4300D2,
-               0x4300B1,
-               0x4300B1
-       };
-       struct perf_event *event;
-       int i;
-
-       /*
-        * The Errata requires below steps:
-        * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
-        * 2) Configure 4 PERFEVTSELx with the magic events and clear
-        *    the corresponding PMCx;
-        * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
-        * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
-        * 5) Clear 4 pairs of ERFEVTSELx and PMCx;
-        */
-
-       /*
-        * The real steps we choose are a little different from above.
-        * A) To reduce MSR operations, we don't run step 1) as they
-        *    are already cleared before this function is called;
-        * B) Call x86_perf_event_update to save PMCx before configuring
-        *    PERFEVTSELx with magic number;
-        * C) With step 5), we do clear only when the PERFEVTSELx is
-        *    not used currently.
-        * D) Call x86_perf_event_set_period to restore PMCx;
-        */
-
-       /* We always operate 4 pairs of PERF Counters */
-       for (i = 0; i < 4; i++) {
-               event = cpuc->events[i];
-               if (event)
-                       x86_perf_event_update(event);
-       }
-
-       for (i = 0; i < 4; i++) {
-               wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
-               wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
-       }
-
-       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
-       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
-
-       for (i = 0; i < 4; i++) {
-               event = cpuc->events[i];
-
-               if (event) {
-                       x86_perf_event_set_period(event);
-                       __x86_pmu_enable_event(&event->hw,
-                                       ARCH_PERFMON_EVENTSEL_ENABLE);
-               } else
-                       wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
-       }
-}
-
-static void intel_pmu_nhm_enable_all(int added)
-{
-       if (added)
-               intel_pmu_nhm_workaround();
-       intel_pmu_enable_all(added);
-}
-
-static inline u64 intel_pmu_get_status(void)
-{
-       u64 status;
-
-       rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-
-       return status;
-}
-
-static inline void intel_pmu_ack_status(u64 ack)
-{
-       wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
-}
-
-static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
-{
-       int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
-       u64 ctrl_val, mask;
-
-       mask = 0xfULL << (idx * 4);
-
-       rdmsrl(hwc->config_base, ctrl_val);
-       ctrl_val &= ~mask;
-       wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static inline bool event_is_checkpointed(struct perf_event *event)
-{
-       return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
-}
-
-static void intel_pmu_disable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
-               intel_pmu_disable_bts();
-               intel_pmu_drain_bts_buffer();
-               return;
-       }
-
-       cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
-       cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
-       cpuc->intel_cp_status &= ~(1ull << hwc->idx);
-
-       /*
-        * must disable before any actual event
-        * because any event may be combined with LBR
-        */
-       if (needs_branch_stack(event))
-               intel_pmu_lbr_disable(event);
-
-       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
-               intel_pmu_disable_fixed(hwc);
-               return;
-       }
-
-       x86_pmu_disable_event(event);
-
-       if (unlikely(event->attr.precise_ip))
-               intel_pmu_pebs_disable(event);
-}
-
-static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
-{
-       int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
-       u64 ctrl_val, bits, mask;
-
-       /*
-        * Enable IRQ generation (0x8),
-        * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
-        * if requested:
-        */
-       bits = 0x8ULL;
-       if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
-               bits |= 0x2;
-       if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
-               bits |= 0x1;
-
-       /*
-        * ANY bit is supported in v3 and up
-        */
-       if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
-               bits |= 0x4;
-
-       bits <<= (idx * 4);
-       mask = 0xfULL << (idx * 4);
-
-       rdmsrl(hwc->config_base, ctrl_val);
-       ctrl_val &= ~mask;
-       ctrl_val |= bits;
-       wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static void intel_pmu_enable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
-               if (!__this_cpu_read(cpu_hw_events.enabled))
-                       return;
-
-               intel_pmu_enable_bts(hwc->config);
-               return;
-       }
-       /*
-        * must enabled before any actual event
-        * because any event may be combined with LBR
-        */
-       if (needs_branch_stack(event))
-               intel_pmu_lbr_enable(event);
-
-       if (event->attr.exclude_host)
-               cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
-       if (event->attr.exclude_guest)
-               cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
-
-       if (unlikely(event_is_checkpointed(event)))
-               cpuc->intel_cp_status |= (1ull << hwc->idx);
-
-       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
-               intel_pmu_enable_fixed(hwc);
-               return;
-       }
-
-       if (unlikely(event->attr.precise_ip))
-               intel_pmu_pebs_enable(event);
-
-       __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
-}
-
-/*
- * Save and restart an expired event. Called by NMI contexts,
- * so it has to be careful about preempting normal event ops:
- */
-int intel_pmu_save_and_restart(struct perf_event *event)
-{
-       x86_perf_event_update(event);
-       /*
-        * For a checkpointed counter always reset back to 0.  This
-        * avoids a situation where the counter overflows, aborts the
-        * transaction and is then set back to shortly before the
-        * overflow, and overflows and aborts again.
-        */
-       if (unlikely(event_is_checkpointed(event))) {
-               /* No race with NMIs because the counter should not be armed */
-               wrmsrl(event->hw.event_base, 0);
-               local64_set(&event->hw.prev_count, 0);
-       }
-       return x86_perf_event_set_period(event);
-}
-
-static void intel_pmu_reset(void)
-{
-       struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
-       unsigned long flags;
-       int idx;
-
-       if (!x86_pmu.num_counters)
-               return;
-
-       local_irq_save(flags);
-
-       pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
-               wrmsrl_safe(x86_pmu_event_addr(idx),  0ull);
-       }
-       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
-               wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
-
-       if (ds)
-               ds->bts_index = ds->bts_buffer_base;
-
-       /* Ack all overflows and disable fixed counters */
-       if (x86_pmu.version >= 2) {
-               intel_pmu_ack_status(intel_pmu_get_status());
-               wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-       }
-
-       /* Reset LBRs and LBR freezing */
-       if (x86_pmu.lbr_nr) {
-               update_debugctlmsr(get_debugctlmsr() &
-                       ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
-       }
-
-       local_irq_restore(flags);
-}
-
-/*
- * This handler is triggered by the local APIC, so the APIC IRQ handling
- * rules apply:
- */
-static int intel_pmu_handle_irq(struct pt_regs *regs)
-{
-       struct perf_sample_data data;
-       struct cpu_hw_events *cpuc;
-       int bit, loops;
-       u64 status;
-       int handled;
-
-       cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       /*
-        * No known reason to not always do late ACK,
-        * but just in case do it opt-in.
-        */
-       if (!x86_pmu.late_ack)
-               apic_write(APIC_LVTPC, APIC_DM_NMI);
-       __intel_pmu_disable_all();
-       handled = intel_pmu_drain_bts_buffer();
-       handled += intel_bts_interrupt();
-       status = intel_pmu_get_status();
-       if (!status)
-               goto done;
-
-       loops = 0;
-again:
-       intel_pmu_lbr_read();
-       intel_pmu_ack_status(status);
-       if (++loops > 100) {
-               static bool warned = false;
-               if (!warned) {
-                       WARN(1, "perfevents: irq loop stuck!\n");
-                       perf_event_print_debug();
-                       warned = true;
-               }
-               intel_pmu_reset();
-               goto done;
-       }
-
-       inc_irq_stat(apic_perf_irqs);
-
-
-       /*
-        * Ignore a range of extra bits in status that do not indicate
-        * overflow by themselves.
-        */
-       status &= ~(GLOBAL_STATUS_COND_CHG |
-                   GLOBAL_STATUS_ASIF |
-                   GLOBAL_STATUS_LBRS_FROZEN);
-       if (!status)
-               goto done;
-
-       /*
-        * PEBS overflow sets bit 62 in the global status register
-        */
-       if (__test_and_clear_bit(62, (unsigned long *)&status)) {
-               handled++;
-               x86_pmu.drain_pebs(regs);
-       }
-
-       /*
-        * Intel PT
-        */
-       if (__test_and_clear_bit(55, (unsigned long *)&status)) {
-               handled++;
-               intel_pt_interrupt();
-       }
-
-       /*
-        * Checkpointed counters can lead to 'spurious' PMIs because the
-        * rollback caused by the PMI will have cleared the overflow status
-        * bit. Therefore always force probe these counters.
-        */
-       status |= cpuc->intel_cp_status;
-
-       for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
-               struct perf_event *event = cpuc->events[bit];
-
-               handled++;
-
-               if (!test_bit(bit, cpuc->active_mask))
-                       continue;
-
-               if (!intel_pmu_save_and_restart(event))
-                       continue;
-
-               perf_sample_data_init(&data, 0, event->hw.last_period);
-
-               if (has_branch_stack(event))
-                       data.br_stack = &cpuc->lbr_stack;
-
-               if (perf_event_overflow(event, &data, regs))
-                       x86_pmu_stop(event, 0);
-       }
-
-       /*
-        * Repeat if there is more work to be done:
-        */
-       status = intel_pmu_get_status();
-       if (status)
-               goto again;
-
-done:
-       __intel_pmu_enable_all(0, true);
-       /*
-        * Only unmask the NMI after the overflow counters
-        * have been reset. This avoids spurious NMIs on
-        * Haswell CPUs.
-        */
-       if (x86_pmu.late_ack)
-               apic_write(APIC_LVTPC, APIC_DM_NMI);
-       return handled;
-}
-
-static struct event_constraint *
-intel_bts_constraints(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       unsigned int hw_event, bts_event;
-
-       if (event->attr.freq)
-               return NULL;
-
-       hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
-       bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
-
-       if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
-               return &bts_constraint;
-
-       return NULL;
-}
-
-static int intel_alt_er(int idx, u64 config)
-{
-       int alt_idx = idx;
-
-       if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
-               return idx;
-
-       if (idx == EXTRA_REG_RSP_0)
-               alt_idx = EXTRA_REG_RSP_1;
-
-       if (idx == EXTRA_REG_RSP_1)
-               alt_idx = EXTRA_REG_RSP_0;
-
-       if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
-               return idx;
-
-       return alt_idx;
-}
-
-static void intel_fixup_er(struct perf_event *event, int idx)
-{
-       event->hw.extra_reg.idx = idx;
-
-       if (idx == EXTRA_REG_RSP_0) {
-               event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
-               event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
-               event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
-       } else if (idx == EXTRA_REG_RSP_1) {
-               event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
-               event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
-               event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
-       }
-}
-
-/*
- * manage allocation of shared extra msr for certain events
- *
- * sharing can be:
- * per-cpu: to be shared between the various events on a single PMU
- * per-core: per-cpu + shared by HT threads
- */
-static struct event_constraint *
-__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
-                                  struct perf_event *event,
-                                  struct hw_perf_event_extra *reg)
-{
-       struct event_constraint *c = &emptyconstraint;
-       struct er_account *era;
-       unsigned long flags;
-       int idx = reg->idx;
-
-       /*
-        * reg->alloc can be set due to existing state, so for fake cpuc we
-        * need to ignore this, otherwise we might fail to allocate proper fake
-        * state for this extra reg constraint. Also see the comment below.
-        */
-       if (reg->alloc && !cpuc->is_fake)
-               return NULL; /* call x86_get_event_constraint() */
-
-again:
-       era = &cpuc->shared_regs->regs[idx];
-       /*
-        * we use spin_lock_irqsave() to avoid lockdep issues when
-        * passing a fake cpuc
-        */
-       raw_spin_lock_irqsave(&era->lock, flags);
-
-       if (!atomic_read(&era->ref) || era->config == reg->config) {
-
-               /*
-                * If its a fake cpuc -- as per validate_{group,event}() we
-                * shouldn't touch event state and we can avoid doing so
-                * since both will only call get_event_constraints() once
-                * on each event, this avoids the need for reg->alloc.
-                *
-                * Not doing the ER fixup will only result in era->reg being
-                * wrong, but since we won't actually try and program hardware
-                * this isn't a problem either.
-                */
-               if (!cpuc->is_fake) {
-                       if (idx != reg->idx)
-                               intel_fixup_er(event, idx);
-
-                       /*
-                        * x86_schedule_events() can call get_event_constraints()
-                        * multiple times on events in the case of incremental
-                        * scheduling(). reg->alloc ensures we only do the ER
-                        * allocation once.
-                        */
-                       reg->alloc = 1;
-               }
-
-               /* lock in msr value */
-               era->config = reg->config;
-               era->reg = reg->reg;
-
-               /* one more user */
-               atomic_inc(&era->ref);
-
-               /*
-                * need to call x86_get_event_constraint()
-                * to check if associated event has constraints
-                */
-               c = NULL;
-       } else {
-               idx = intel_alt_er(idx, reg->config);
-               if (idx != reg->idx) {
-                       raw_spin_unlock_irqrestore(&era->lock, flags);
-                       goto again;
-               }
-       }
-       raw_spin_unlock_irqrestore(&era->lock, flags);
-
-       return c;
-}
-
-static void
-__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
-                                  struct hw_perf_event_extra *reg)
-{
-       struct er_account *era;
-
-       /*
-        * Only put constraint if extra reg was actually allocated. Also takes
-        * care of event which do not use an extra shared reg.
-        *
-        * Also, if this is a fake cpuc we shouldn't touch any event state
-        * (reg->alloc) and we don't care about leaving inconsistent cpuc state
-        * either since it'll be thrown out.
-        */
-       if (!reg->alloc || cpuc->is_fake)
-               return;
-
-       era = &cpuc->shared_regs->regs[reg->idx];
-
-       /* one fewer user */
-       atomic_dec(&era->ref);
-
-       /* allocate again next time */
-       reg->alloc = 0;
-}
-
-static struct event_constraint *
-intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
-                             struct perf_event *event)
-{
-       struct event_constraint *c = NULL, *d;
-       struct hw_perf_event_extra *xreg, *breg;
-
-       xreg = &event->hw.extra_reg;
-       if (xreg->idx != EXTRA_REG_NONE) {
-               c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
-               if (c == &emptyconstraint)
-                       return c;
-       }
-       breg = &event->hw.branch_reg;
-       if (breg->idx != EXTRA_REG_NONE) {
-               d = __intel_shared_reg_get_constraints(cpuc, event, breg);
-               if (d == &emptyconstraint) {
-                       __intel_shared_reg_put_constraints(cpuc, xreg);
-                       c = d;
-               }
-       }
-       return c;
-}
-
-struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                         struct perf_event *event)
-{
-       struct event_constraint *c;
-
-       if (x86_pmu.event_constraints) {
-               for_each_event_constraint(c, x86_pmu.event_constraints) {
-                       if ((event->hw.config & c->cmask) == c->code) {
-                               event->hw.flags |= c->flags;
-                               return c;
-                       }
-               }
-       }
-
-       return &unconstrained;
-}
-
-static struct event_constraint *
-__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                           struct perf_event *event)
-{
-       struct event_constraint *c;
-
-       c = intel_bts_constraints(event);
-       if (c)
-               return c;
-
-       c = intel_shared_regs_constraints(cpuc, event);
-       if (c)
-               return c;
-
-       c = intel_pebs_constraints(event);
-       if (c)
-               return c;
-
-       return x86_get_event_constraints(cpuc, idx, event);
-}
-
-static void
-intel_start_scheduling(struct cpu_hw_events *cpuc)
-{
-       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct intel_excl_states *xl;
-       int tid = cpuc->excl_thread_id;
-
-       /*
-        * nothing needed if in group validation mode
-        */
-       if (cpuc->is_fake || !is_ht_workaround_enabled())
-               return;
-
-       /*
-        * no exclusion needed
-        */
-       if (WARN_ON_ONCE(!excl_cntrs))
-               return;
-
-       xl = &excl_cntrs->states[tid];
-
-       xl->sched_started = true;
-       /*
-        * lock shared state until we are done scheduling
-        * in stop_event_scheduling()
-        * makes scheduling appear as a transaction
-        */
-       raw_spin_lock(&excl_cntrs->lock);
-}
-
-static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
-{
-       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct event_constraint *c = cpuc->event_constraint[idx];
-       struct intel_excl_states *xl;
-       int tid = cpuc->excl_thread_id;
-
-       if (cpuc->is_fake || !is_ht_workaround_enabled())
-               return;
-
-       if (WARN_ON_ONCE(!excl_cntrs))
-               return;
-
-       if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
-               return;
-
-       xl = &excl_cntrs->states[tid];
-
-       lockdep_assert_held(&excl_cntrs->lock);
-
-       if (c->flags & PERF_X86_EVENT_EXCL)
-               xl->state[cntr] = INTEL_EXCL_EXCLUSIVE;
-       else
-               xl->state[cntr] = INTEL_EXCL_SHARED;
-}
-
-static void
-intel_stop_scheduling(struct cpu_hw_events *cpuc)
-{
-       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct intel_excl_states *xl;
-       int tid = cpuc->excl_thread_id;
-
-       /*
-        * nothing needed if in group validation mode
-        */
-       if (cpuc->is_fake || !is_ht_workaround_enabled())
-               return;
-       /*
-        * no exclusion needed
-        */
-       if (WARN_ON_ONCE(!excl_cntrs))
-               return;
-
-       xl = &excl_cntrs->states[tid];
-
-       xl->sched_started = false;
-       /*
-        * release shared state lock (acquired in intel_start_scheduling())
-        */
-       raw_spin_unlock(&excl_cntrs->lock);
-}
-
-static struct event_constraint *
-intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
-                          int idx, struct event_constraint *c)
-{
-       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct intel_excl_states *xlo;
-       int tid = cpuc->excl_thread_id;
-       int is_excl, i;
-
-       /*
-        * validating a group does not require
-        * enforcing cross-thread  exclusion
-        */
-       if (cpuc->is_fake || !is_ht_workaround_enabled())
-               return c;
-
-       /*
-        * no exclusion needed
-        */
-       if (WARN_ON_ONCE(!excl_cntrs))
-               return c;
-
-       /*
-        * because we modify the constraint, we need
-        * to make a copy. Static constraints come
-        * from static const tables.
-        *
-        * only needed when constraint has not yet
-        * been cloned (marked dynamic)
-        */
-       if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
-               struct event_constraint *cx;
-
-               /*
-                * grab pre-allocated constraint entry
-                */
-               cx = &cpuc->constraint_list[idx];
-
-               /*
-                * initialize dynamic constraint
-                * with static constraint
-                */
-               *cx = *c;
-
-               /*
-                * mark constraint as dynamic, so we
-                * can free it later on
-                */
-               cx->flags |= PERF_X86_EVENT_DYNAMIC;
-               c = cx;
-       }
-
-       /*
-        * From here on, the constraint is dynamic.
-        * Either it was just allocated above, or it
-        * was allocated during a earlier invocation
-        * of this function
-        */
-
-       /*
-        * state of sibling HT
-        */
-       xlo = &excl_cntrs->states[tid ^ 1];
-
-       /*
-        * event requires exclusive counter access
-        * across HT threads
-        */
-       is_excl = c->flags & PERF_X86_EVENT_EXCL;
-       if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
-               event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
-               if (!cpuc->n_excl++)
-                       WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
-       }
-
-       /*
-        * Modify static constraint with current dynamic
-        * state of thread
-        *
-        * EXCLUSIVE: sibling counter measuring exclusive event
-        * SHARED   : sibling counter measuring non-exclusive event
-        * UNUSED   : sibling counter unused
-        */
-       for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
-               /*
-                * exclusive event in sibling counter
-                * our corresponding counter cannot be used
-                * regardless of our event
-                */
-               if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE)
-                       __clear_bit(i, c->idxmsk);
-               /*
-                * if measuring an exclusive event, sibling
-                * measuring non-exclusive, then counter cannot
-                * be used
-                */
-               if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED)
-                       __clear_bit(i, c->idxmsk);
-       }
-
-       /*
-        * recompute actual bit weight for scheduling algorithm
-        */
-       c->weight = hweight64(c->idxmsk64);
-
-       /*
-        * if we return an empty mask, then switch
-        * back to static empty constraint to avoid
-        * the cost of freeing later on
-        */
-       if (c->weight == 0)
-               c = &emptyconstraint;
-
-       return c;
-}
-
-static struct event_constraint *
-intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                           struct perf_event *event)
-{
-       struct event_constraint *c1 = NULL;
-       struct event_constraint *c2;
-
-       if (idx >= 0) /* fake does < 0 */
-               c1 = cpuc->event_constraint[idx];
-
-       /*
-        * first time only
-        * - static constraint: no change across incremental scheduling calls
-        * - dynamic constraint: handled by intel_get_excl_constraints()
-        */
-       c2 = __intel_get_event_constraints(cpuc, idx, event);
-       if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
-               bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
-               c1->weight = c2->weight;
-               c2 = c1;
-       }
-
-       if (cpuc->excl_cntrs)
-               return intel_get_excl_constraints(cpuc, event, idx, c2);
-
-       return c2;
-}
-
-static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
-               struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       int tid = cpuc->excl_thread_id;
-       struct intel_excl_states *xl;
-
-       /*
-        * nothing needed if in group validation mode
-        */
-       if (cpuc->is_fake)
-               return;
-
-       if (WARN_ON_ONCE(!excl_cntrs))
-               return;
-
-       if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
-               hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
-               if (!--cpuc->n_excl)
-                       WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
-       }
-
-       /*
-        * If event was actually assigned, then mark the counter state as
-        * unused now.
-        */
-       if (hwc->idx >= 0) {
-               xl = &excl_cntrs->states[tid];
-
-               /*
-                * put_constraint may be called from x86_schedule_events()
-                * which already has the lock held so here make locking
-                * conditional.
-                */
-               if (!xl->sched_started)
-                       raw_spin_lock(&excl_cntrs->lock);
-
-               xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
-
-               if (!xl->sched_started)
-                       raw_spin_unlock(&excl_cntrs->lock);
-       }
-}
-
-static void
-intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
-                                       struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg;
-
-       reg = &event->hw.extra_reg;
-       if (reg->idx != EXTRA_REG_NONE)
-               __intel_shared_reg_put_constraints(cpuc, reg);
-
-       reg = &event->hw.branch_reg;
-       if (reg->idx != EXTRA_REG_NONE)
-               __intel_shared_reg_put_constraints(cpuc, reg);
-}
-
-static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
-                                       struct perf_event *event)
-{
-       intel_put_shared_regs_event_constraints(cpuc, event);
-
-       /*
-        * is PMU has exclusive counter restrictions, then
-        * all events are subject to and must call the
-        * put_excl_constraints() routine
-        */
-       if (cpuc->excl_cntrs)
-               intel_put_excl_constraints(cpuc, event);
-}
-
-static void intel_pebs_aliases_core2(struct perf_event *event)
-{
-       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
-               /*
-                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
-                * (0x003c) so that we can use it with PEBS.
-                *
-                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
-                * PEBS capable. However we can use INST_RETIRED.ANY_P
-                * (0x00c0), which is a PEBS capable event, to get the same
-                * count.
-                *
-                * INST_RETIRED.ANY_P counts the number of cycles that retires
-                * CNTMASK instructions. By setting CNTMASK to a value (16)
-                * larger than the maximum number of instructions that can be
-                * retired per cycle (4) and then inverting the condition, we
-                * count all cycles that retire 16 or less instructions, which
-                * is every cycle.
-                *
-                * Thereby we gain a PEBS capable cycle counter.
-                */
-               u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
-
-               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
-               event->hw.config = alt_config;
-       }
-}
-
-static void intel_pebs_aliases_snb(struct perf_event *event)
-{
-       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
-               /*
-                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
-                * (0x003c) so that we can use it with PEBS.
-                *
-                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
-                * PEBS capable. However we can use UOPS_RETIRED.ALL
-                * (0x01c2), which is a PEBS capable event, to get the same
-                * count.
-                *
-                * UOPS_RETIRED.ALL counts the number of cycles that retires
-                * CNTMASK micro-ops. By setting CNTMASK to a value (16)
-                * larger than the maximum number of micro-ops that can be
-                * retired per cycle (4) and then inverting the condition, we
-                * count all cycles that retire 16 or less micro-ops, which
-                * is every cycle.
-                *
-                * Thereby we gain a PEBS capable cycle counter.
-                */
-               u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
-
-               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
-               event->hw.config = alt_config;
-       }
-}
-
-static void intel_pebs_aliases_precdist(struct perf_event *event)
-{
-       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
-               /*
-                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
-                * (0x003c) so that we can use it with PEBS.
-                *
-                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
-                * PEBS capable. However we can use INST_RETIRED.PREC_DIST
-                * (0x01c0), which is a PEBS capable event, to get the same
-                * count.
-                *
-                * The PREC_DIST event has special support to minimize sample
-                * shadowing effects. One drawback is that it can be
-                * only programmed on counter 1, but that seems like an
-                * acceptable trade off.
-                */
-               u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
-
-               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
-               event->hw.config = alt_config;
-       }
-}
-
-static void intel_pebs_aliases_ivb(struct perf_event *event)
-{
-       if (event->attr.precise_ip < 3)
-               return intel_pebs_aliases_snb(event);
-       return intel_pebs_aliases_precdist(event);
-}
-
-static void intel_pebs_aliases_skl(struct perf_event *event)
-{
-       if (event->attr.precise_ip < 3)
-               return intel_pebs_aliases_core2(event);
-       return intel_pebs_aliases_precdist(event);
-}
-
-static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
-{
-       unsigned long flags = x86_pmu.free_running_flags;
-
-       if (event->attr.use_clockid)
-               flags &= ~PERF_SAMPLE_TIME;
-       return flags;
-}
-
-static int intel_pmu_hw_config(struct perf_event *event)
-{
-       int ret = x86_pmu_hw_config(event);
-
-       if (ret)
-               return ret;
-
-       if (event->attr.precise_ip) {
-               if (!event->attr.freq) {
-                       event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
-                       if (!(event->attr.sample_type &
-                             ~intel_pmu_free_running_flags(event)))
-                               event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
-               }
-               if (x86_pmu.pebs_aliases)
-                       x86_pmu.pebs_aliases(event);
-       }
-
-       if (needs_branch_stack(event)) {
-               ret = intel_pmu_setup_lbr_filter(event);
-               if (ret)
-                       return ret;
-
-               /*
-                * BTS is set up earlier in this path, so don't account twice
-                */
-               if (!intel_pmu_has_bts(event)) {
-                       /* disallow lbr if conflicting events are present */
-                       if (x86_add_exclusive(x86_lbr_exclusive_lbr))
-                               return -EBUSY;
-
-                       event->destroy = hw_perf_lbr_event_destroy;
-               }
-       }
-
-       if (event->attr.type != PERF_TYPE_RAW)
-               return 0;
-
-       if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
-               return 0;
-
-       if (x86_pmu.version < 3)
-               return -EINVAL;
-
-       if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-               return -EACCES;
-
-       event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
-
-       return 0;
-}
-
-struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
-{
-       if (x86_pmu.guest_get_msrs)
-               return x86_pmu.guest_get_msrs(nr);
-       *nr = 0;
-       return NULL;
-}
-EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
-
-static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
-
-       arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
-       arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
-       arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
-       /*
-        * If PMU counter has PEBS enabled it is not enough to disable counter
-        * on a guest entry since PEBS memory write can overshoot guest entry
-        * and corrupt guest memory. Disabling PEBS solves the problem.
-        */
-       arr[1].msr = MSR_IA32_PEBS_ENABLE;
-       arr[1].host = cpuc->pebs_enabled;
-       arr[1].guest = 0;
-
-       *nr = 2;
-       return arr;
-}
-
-static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++)  {
-               struct perf_event *event = cpuc->events[idx];
-
-               arr[idx].msr = x86_pmu_config_addr(idx);
-               arr[idx].host = arr[idx].guest = 0;
-
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-
-               arr[idx].host = arr[idx].guest =
-                       event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE;
-
-               if (event->attr.exclude_host)
-                       arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-               else if (event->attr.exclude_guest)
-                       arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-       }
-
-       *nr = x86_pmu.num_counters;
-       return arr;
-}
-
-static void core_pmu_enable_event(struct perf_event *event)
-{
-       if (!event->attr.exclude_host)
-               x86_pmu_enable_event(event);
-}
-
-static void core_pmu_enable_all(int added)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
-
-               if (!test_bit(idx, cpuc->active_mask) ||
-                               cpuc->events[idx]->attr.exclude_host)
-                       continue;
-
-               __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
-       }
-}
-
-static int hsw_hw_config(struct perf_event *event)
-{
-       int ret = intel_pmu_hw_config(event);
-
-       if (ret)
-               return ret;
-       if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE))
-               return 0;
-       event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
-
-       /*
-        * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with
-        * PEBS or in ANY thread mode. Since the results are non-sensical forbid
-        * this combination.
-        */
-       if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) &&
-            ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) ||
-             event->attr.precise_ip > 0))
-               return -EOPNOTSUPP;
-
-       if (event_is_checkpointed(event)) {
-               /*
-                * Sampling of checkpointed events can cause situations where
-                * the CPU constantly aborts because of a overflow, which is
-                * then checkpointed back and ignored. Forbid checkpointing
-                * for sampling.
-                *
-                * But still allow a long sampling period, so that perf stat
-                * from KVM works.
-                */
-               if (event->attr.sample_period > 0 &&
-                   event->attr.sample_period < 0x7fffffff)
-                       return -EOPNOTSUPP;
-       }
-       return 0;
-}
-
-static struct event_constraint counter2_constraint =
-                       EVENT_CONSTRAINT(0, 0x4, 0);
-
-static struct event_constraint *
-hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                         struct perf_event *event)
-{
-       struct event_constraint *c;
-
-       c = intel_get_event_constraints(cpuc, idx, event);
-
-       /* Handle special quirk on in_tx_checkpointed only in counter 2 */
-       if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
-               if (c->idxmsk64 & (1U << 2))
-                       return &counter2_constraint;
-               return &emptyconstraint;
-       }
-
-       return c;
-}
-
-/*
- * Broadwell:
- *
- * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
- * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
- * the two to enforce a minimum period of 128 (the smallest value that has bits
- * 0-5 cleared and >= 100).
- *
- * Because of how the code in x86_perf_event_set_period() works, the truncation
- * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
- * to make up for the 'lost' events due to carrying the 'error' in period_left.
- *
- * Therefore the effective (average) period matches the requested period,
- * despite coarser hardware granularity.
- */
-static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
-{
-       if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
-                       X86_CONFIG(.event=0xc0, .umask=0x01)) {
-               if (left < 128)
-                       left = 128;
-               left &= ~0x3fu;
-       }
-       return left;
-}
-
-PMU_FORMAT_ATTR(event, "config:0-7"    );
-PMU_FORMAT_ATTR(umask, "config:8-15"   );
-PMU_FORMAT_ATTR(edge,  "config:18"     );
-PMU_FORMAT_ATTR(pc,    "config:19"     );
-PMU_FORMAT_ATTR(any,   "config:21"     ); /* v3 + */
-PMU_FORMAT_ATTR(inv,   "config:23"     );
-PMU_FORMAT_ATTR(cmask, "config:24-31"  );
-PMU_FORMAT_ATTR(in_tx,  "config:32");
-PMU_FORMAT_ATTR(in_tx_cp, "config:33");
-
-static struct attribute *intel_arch_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_pc.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask.attr,
-       NULL,
-};
-
-ssize_t intel_event_sysfs_show(char *page, u64 config)
-{
-       u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
-
-       return x86_event_sysfs_show(page, config, event);
-}
-
-struct intel_shared_regs *allocate_shared_regs(int cpu)
-{
-       struct intel_shared_regs *regs;
-       int i;
-
-       regs = kzalloc_node(sizeof(struct intel_shared_regs),
-                           GFP_KERNEL, cpu_to_node(cpu));
-       if (regs) {
-               /*
-                * initialize the locks to keep lockdep happy
-                */
-               for (i = 0; i < EXTRA_REG_MAX; i++)
-                       raw_spin_lock_init(&regs->regs[i].lock);
-
-               regs->core_id = -1;
-       }
-       return regs;
-}
-
-static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
-{
-       struct intel_excl_cntrs *c;
-
-       c = kzalloc_node(sizeof(struct intel_excl_cntrs),
-                        GFP_KERNEL, cpu_to_node(cpu));
-       if (c) {
-               raw_spin_lock_init(&c->lock);
-               c->core_id = -1;
-       }
-       return c;
-}
-
-static int intel_pmu_cpu_prepare(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-
-       if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
-               cpuc->shared_regs = allocate_shared_regs(cpu);
-               if (!cpuc->shared_regs)
-                       goto err;
-       }
-
-       if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
-               size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
-
-               cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
-               if (!cpuc->constraint_list)
-                       goto err_shared_regs;
-
-               cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
-               if (!cpuc->excl_cntrs)
-                       goto err_constraint_list;
-
-               cpuc->excl_thread_id = 0;
-       }
-
-       return NOTIFY_OK;
-
-err_constraint_list:
-       kfree(cpuc->constraint_list);
-       cpuc->constraint_list = NULL;
-
-err_shared_regs:
-       kfree(cpuc->shared_regs);
-       cpuc->shared_regs = NULL;
-
-err:
-       return NOTIFY_BAD;
-}
-
-static void intel_pmu_cpu_starting(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-       int core_id = topology_core_id(cpu);
-       int i;
-
-       init_debug_store_on_cpu(cpu);
-       /*
-        * Deal with CPUs that don't clear their LBRs on power-up.
-        */
-       intel_pmu_lbr_reset();
-
-       cpuc->lbr_sel = NULL;
-
-       if (!cpuc->shared_regs)
-               return;
-
-       if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
-               for_each_cpu(i, topology_sibling_cpumask(cpu)) {
-                       struct intel_shared_regs *pc;
-
-                       pc = per_cpu(cpu_hw_events, i).shared_regs;
-                       if (pc && pc->core_id == core_id) {
-                               cpuc->kfree_on_online[0] = cpuc->shared_regs;
-                               cpuc->shared_regs = pc;
-                               break;
-                       }
-               }
-               cpuc->shared_regs->core_id = core_id;
-               cpuc->shared_regs->refcnt++;
-       }
-
-       if (x86_pmu.lbr_sel_map)
-               cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
-
-       if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
-               for_each_cpu(i, topology_sibling_cpumask(cpu)) {
-                       struct intel_excl_cntrs *c;
-
-                       c = per_cpu(cpu_hw_events, i).excl_cntrs;
-                       if (c && c->core_id == core_id) {
-                               cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
-                               cpuc->excl_cntrs = c;
-                               cpuc->excl_thread_id = 1;
-                               break;
-                       }
-               }
-               cpuc->excl_cntrs->core_id = core_id;
-               cpuc->excl_cntrs->refcnt++;
-       }
-}
-
-static void free_excl_cntrs(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-       struct intel_excl_cntrs *c;
-
-       c = cpuc->excl_cntrs;
-       if (c) {
-               if (c->core_id == -1 || --c->refcnt == 0)
-                       kfree(c);
-               cpuc->excl_cntrs = NULL;
-               kfree(cpuc->constraint_list);
-               cpuc->constraint_list = NULL;
-       }
-}
-
-static void intel_pmu_cpu_dying(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-       struct intel_shared_regs *pc;
-
-       pc = cpuc->shared_regs;
-       if (pc) {
-               if (pc->core_id == -1 || --pc->refcnt == 0)
-                       kfree(pc);
-               cpuc->shared_regs = NULL;
-       }
-
-       free_excl_cntrs(cpu);
-
-       fini_debug_store_on_cpu(cpu);
-}
-
-static void intel_pmu_sched_task(struct perf_event_context *ctx,
-                                bool sched_in)
-{
-       if (x86_pmu.pebs_active)
-               intel_pmu_pebs_sched_task(ctx, sched_in);
-       if (x86_pmu.lbr_nr)
-               intel_pmu_lbr_sched_task(ctx, sched_in);
-}
-
-PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
-
-PMU_FORMAT_ATTR(ldlat, "config1:0-15");
-
-PMU_FORMAT_ATTR(frontend, "config1:0-23");
-
-static struct attribute *intel_arch3_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_pc.attr,
-       &format_attr_any.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask.attr,
-       &format_attr_in_tx.attr,
-       &format_attr_in_tx_cp.attr,
-
-       &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
-       &format_attr_ldlat.attr, /* PEBS load latency */
-       NULL,
-};
-
-static struct attribute *skl_format_attr[] = {
-       &format_attr_frontend.attr,
-       NULL,
-};
-
-static __initconst const struct x86_pmu core_pmu = {
-       .name                   = "core",
-       .handle_irq             = x86_pmu_handle_irq,
-       .disable_all            = x86_pmu_disable_all,
-       .enable_all             = core_pmu_enable_all,
-       .enable                 = core_pmu_enable_event,
-       .disable                = x86_pmu_disable_event,
-       .hw_config              = x86_pmu_hw_config,
-       .schedule_events        = x86_schedule_events,
-       .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
-       .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
-       .event_map              = intel_pmu_event_map,
-       .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
-       .apic                   = 1,
-       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
-
-       /*
-        * Intel PMCs cannot be accessed sanely above 32-bit width,
-        * so we install an artificial 1<<31 period regardless of
-        * the generic event period:
-        */
-       .max_period             = (1ULL<<31) - 1,
-       .get_event_constraints  = intel_get_event_constraints,
-       .put_event_constraints  = intel_put_event_constraints,
-       .event_constraints      = intel_core_event_constraints,
-       .guest_get_msrs         = core_guest_get_msrs,
-       .format_attrs           = intel_arch_formats_attr,
-       .events_sysfs_show      = intel_event_sysfs_show,
-
-       /*
-        * Virtual (or funny metal) CPU can define x86_pmu.extra_regs
-        * together with PMU version 1 and thus be using core_pmu with
-        * shared_regs. We need following callbacks here to allocate
-        * it properly.
-        */
-       .cpu_prepare            = intel_pmu_cpu_prepare,
-       .cpu_starting           = intel_pmu_cpu_starting,
-       .cpu_dying              = intel_pmu_cpu_dying,
-};
-
-static __initconst const struct x86_pmu intel_pmu = {
-       .name                   = "Intel",
-       .handle_irq             = intel_pmu_handle_irq,
-       .disable_all            = intel_pmu_disable_all,
-       .enable_all             = intel_pmu_enable_all,
-       .enable                 = intel_pmu_enable_event,
-       .disable                = intel_pmu_disable_event,
-       .hw_config              = intel_pmu_hw_config,
-       .schedule_events        = x86_schedule_events,
-       .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
-       .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
-       .event_map              = intel_pmu_event_map,
-       .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
-       .apic                   = 1,
-       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
-       /*
-        * Intel PMCs cannot be accessed sanely above 32 bit width,
-        * so we install an artificial 1<<31 period regardless of
-        * the generic event period:
-        */
-       .max_period             = (1ULL << 31) - 1,
-       .get_event_constraints  = intel_get_event_constraints,
-       .put_event_constraints  = intel_put_event_constraints,
-       .pebs_aliases           = intel_pebs_aliases_core2,
-
-       .format_attrs           = intel_arch3_formats_attr,
-       .events_sysfs_show      = intel_event_sysfs_show,
-
-       .cpu_prepare            = intel_pmu_cpu_prepare,
-       .cpu_starting           = intel_pmu_cpu_starting,
-       .cpu_dying              = intel_pmu_cpu_dying,
-       .guest_get_msrs         = intel_guest_get_msrs,
-       .sched_task             = intel_pmu_sched_task,
-};
-
-static __init void intel_clovertown_quirk(void)
-{
-       /*
-        * PEBS is unreliable due to:
-        *
-        *   AJ67  - PEBS may experience CPL leaks
-        *   AJ68  - PEBS PMI may be delayed by one event
-        *   AJ69  - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
-        *   AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
-        *
-        * AJ67 could be worked around by restricting the OS/USR flags.
-        * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
-        *
-        * AJ106 could possibly be worked around by not allowing LBR
-        *       usage from PEBS, including the fixup.
-        * AJ68  could possibly be worked around by always programming
-        *       a pebs_event_reset[0] value and coping with the lost events.
-        *
-        * But taken together it might just make sense to not enable PEBS on
-        * these chips.
-        */
-       pr_warn("PEBS disabled due to CPU errata\n");
-       x86_pmu.pebs = 0;
-       x86_pmu.pebs_constraints = NULL;
-}
-
-static int intel_snb_pebs_broken(int cpu)
-{
-       u32 rev = UINT_MAX; /* default to broken for unknown models */
-
-       switch (cpu_data(cpu).x86_model) {
-       case 42: /* SNB */
-               rev = 0x28;
-               break;
-
-       case 45: /* SNB-EP */
-               switch (cpu_data(cpu).x86_mask) {
-               case 6: rev = 0x618; break;
-               case 7: rev = 0x70c; break;
-               }
-       }
-
-       return (cpu_data(cpu).microcode < rev);
-}
-
-static void intel_snb_check_microcode(void)
-{
-       int pebs_broken = 0;
-       int cpu;
-
-       get_online_cpus();
-       for_each_online_cpu(cpu) {
-               if ((pebs_broken = intel_snb_pebs_broken(cpu)))
-                       break;
-       }
-       put_online_cpus();
-
-       if (pebs_broken == x86_pmu.pebs_broken)
-               return;
-
-       /*
-        * Serialized by the microcode lock..
-        */
-       if (x86_pmu.pebs_broken) {
-               pr_info("PEBS enabled due to microcode update\n");
-               x86_pmu.pebs_broken = 0;
-       } else {
-               pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
-               x86_pmu.pebs_broken = 1;
-       }
-}
-
-/*
- * Under certain circumstances, access certain MSR may cause #GP.
- * The function tests if the input MSR can be safely accessed.
- */
-static bool check_msr(unsigned long msr, u64 mask)
-{
-       u64 val_old, val_new, val_tmp;
-
-       /*
-        * Read the current value, change it and read it back to see if it
-        * matches, this is needed to detect certain hardware emulators
-        * (qemu/kvm) that don't trap on the MSR access and always return 0s.
-        */
-       if (rdmsrl_safe(msr, &val_old))
-               return false;
-
-       /*
-        * Only change the bits which can be updated by wrmsrl.
-        */
-       val_tmp = val_old ^ mask;
-       if (wrmsrl_safe(msr, val_tmp) ||
-           rdmsrl_safe(msr, &val_new))
-               return false;
-
-       if (val_new != val_tmp)
-               return false;
-
-       /* Here it's sure that the MSR can be safely accessed.
-        * Restore the old value and return.
-        */
-       wrmsrl(msr, val_old);
-
-       return true;
-}
-
-static __init void intel_sandybridge_quirk(void)
-{
-       x86_pmu.check_microcode = intel_snb_check_microcode;
-       intel_snb_check_microcode();
-}
-
-static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
-       { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
-       { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
-       { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
-       { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
-       { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
-       { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
-       { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
-};
-
-static __init void intel_arch_events_quirk(void)
-{
-       int bit;
-
-       /* disable event that reported as not presend by cpuid */
-       for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
-               intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
-               pr_warn("CPUID marked event: \'%s\' unavailable\n",
-                       intel_arch_events_map[bit].name);
-       }
-}
-
-static __init void intel_nehalem_quirk(void)
-{
-       union cpuid10_ebx ebx;
-
-       ebx.full = x86_pmu.events_maskl;
-       if (ebx.split.no_branch_misses_retired) {
-               /*
-                * Erratum AAJ80 detected, we work it around by using
-                * the BR_MISP_EXEC.ANY event. This will over-count
-                * branch-misses, but it's still much better than the
-                * architectural event which is often completely bogus:
-                */
-               intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
-               ebx.split.no_branch_misses_retired = 0;
-               x86_pmu.events_maskl = ebx.full;
-               pr_info("CPU erratum AAJ80 worked around\n");
-       }
-}
-
-/*
- * enable software workaround for errata:
- * SNB: BJ122
- * IVB: BV98
- * HSW: HSD29
- *
- * Only needed when HT is enabled. However detecting
- * if HT is enabled is difficult (model specific). So instead,
- * we enable the workaround in the early boot, and verify if
- * it is needed in a later initcall phase once we have valid
- * topology information to check if HT is actually enabled
- */
-static __init void intel_ht_bug(void)
-{
-       x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
-
-       x86_pmu.start_scheduling = intel_start_scheduling;
-       x86_pmu.commit_scheduling = intel_commit_scheduling;
-       x86_pmu.stop_scheduling = intel_stop_scheduling;
-}
-
-EVENT_ATTR_STR(mem-loads,      mem_ld_hsw,     "event=0xcd,umask=0x1,ldlat=3");
-EVENT_ATTR_STR(mem-stores,     mem_st_hsw,     "event=0xd0,umask=0x82")
-
-/* Haswell special events */
-EVENT_ATTR_STR(tx-start,       tx_start,       "event=0xc9,umask=0x1");
-EVENT_ATTR_STR(tx-commit,      tx_commit,      "event=0xc9,umask=0x2");
-EVENT_ATTR_STR(tx-abort,       tx_abort,       "event=0xc9,umask=0x4");
-EVENT_ATTR_STR(tx-capacity,    tx_capacity,    "event=0x54,umask=0x2");
-EVENT_ATTR_STR(tx-conflict,    tx_conflict,    "event=0x54,umask=0x1");
-EVENT_ATTR_STR(el-start,       el_start,       "event=0xc8,umask=0x1");
-EVENT_ATTR_STR(el-commit,      el_commit,      "event=0xc8,umask=0x2");
-EVENT_ATTR_STR(el-abort,       el_abort,       "event=0xc8,umask=0x4");
-EVENT_ATTR_STR(el-capacity,    el_capacity,    "event=0x54,umask=0x2");
-EVENT_ATTR_STR(el-conflict,    el_conflict,    "event=0x54,umask=0x1");
-EVENT_ATTR_STR(cycles-t,       cycles_t,       "event=0x3c,in_tx=1");
-EVENT_ATTR_STR(cycles-ct,      cycles_ct,      "event=0x3c,in_tx=1,in_tx_cp=1");
-
-static struct attribute *hsw_events_attrs[] = {
-       EVENT_PTR(tx_start),
-       EVENT_PTR(tx_commit),
-       EVENT_PTR(tx_abort),
-       EVENT_PTR(tx_capacity),
-       EVENT_PTR(tx_conflict),
-       EVENT_PTR(el_start),
-       EVENT_PTR(el_commit),
-       EVENT_PTR(el_abort),
-       EVENT_PTR(el_capacity),
-       EVENT_PTR(el_conflict),
-       EVENT_PTR(cycles_t),
-       EVENT_PTR(cycles_ct),
-       EVENT_PTR(mem_ld_hsw),
-       EVENT_PTR(mem_st_hsw),
-       NULL
-};
-
-__init int intel_pmu_init(void)
-{
-       union cpuid10_edx edx;
-       union cpuid10_eax eax;
-       union cpuid10_ebx ebx;
-       struct event_constraint *c;
-       unsigned int unused;
-       struct extra_reg *er;
-       int version, i;
-
-       if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-               switch (boot_cpu_data.x86) {
-               case 0x6:
-                       return p6_pmu_init();
-               case 0xb:
-                       return knc_pmu_init();
-               case 0xf:
-                       return p4_pmu_init();
-               }
-               return -ENODEV;
-       }
-
-       /*
-        * Check whether the Architectural PerfMon supports
-        * Branch Misses Retired hw_event or not.
-        */
-       cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
-       if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
-               return -ENODEV;
-
-       version = eax.split.version_id;
-       if (version < 2)
-               x86_pmu = core_pmu;
-       else
-               x86_pmu = intel_pmu;
-
-       x86_pmu.version                 = version;
-       x86_pmu.num_counters            = eax.split.num_counters;
-       x86_pmu.cntval_bits             = eax.split.bit_width;
-       x86_pmu.cntval_mask             = (1ULL << eax.split.bit_width) - 1;
-
-       x86_pmu.events_maskl            = ebx.full;
-       x86_pmu.events_mask_len         = eax.split.mask_length;
-
-       x86_pmu.max_pebs_events         = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
-
-       /*
-        * Quirk: v2 perfmon does not report fixed-purpose events, so
-        * assume at least 3 events:
-        */
-       if (version > 1)
-               x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
-
-       if (boot_cpu_has(X86_FEATURE_PDCM)) {
-               u64 capabilities;
-
-               rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
-               x86_pmu.intel_cap.capabilities = capabilities;
-       }
-
-       intel_ds_init();
-
-       x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
-
-       /*
-        * Install the hw-cache-events table:
-        */
-       switch (boot_cpu_data.x86_model) {
-       case 14: /* 65nm Core "Yonah" */
-               pr_cont("Core events, ");
-               break;
-
-       case 15: /* 65nm Core2 "Merom"          */
-               x86_add_quirk(intel_clovertown_quirk);
-       case 22: /* 65nm Core2 "Merom-L"        */
-       case 23: /* 45nm Core2 "Penryn"         */
-       case 29: /* 45nm Core2 "Dunnington (MP) */
-               memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-
-               intel_pmu_lbr_init_core();
-
-               x86_pmu.event_constraints = intel_core2_event_constraints;
-               x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
-               pr_cont("Core2 events, ");
-               break;
-
-       case 30: /* 45nm Nehalem    */
-       case 26: /* 45nm Nehalem-EP */
-       case 46: /* 45nm Nehalem-EX */
-               memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
-                      sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_nhm();
-
-               x86_pmu.event_constraints = intel_nehalem_event_constraints;
-               x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
-               x86_pmu.enable_all = intel_pmu_nhm_enable_all;
-               x86_pmu.extra_regs = intel_nehalem_extra_regs;
-
-               x86_pmu.cpu_events = nhm_events_attrs;
-
-               /* UOPS_ISSUED.STALLED_CYCLES */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
-                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
-               /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
-                       X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
-
-               x86_add_quirk(intel_nehalem_quirk);
-
-               pr_cont("Nehalem events, ");
-               break;
-
-       case 28: /* 45nm Atom "Pineview"   */
-       case 38: /* 45nm Atom "Lincroft"   */
-       case 39: /* 32nm Atom "Penwell"    */
-       case 53: /* 32nm Atom "Cloverview" */
-       case 54: /* 32nm Atom "Cedarview"  */
-               memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-
-               intel_pmu_lbr_init_atom();
-
-               x86_pmu.event_constraints = intel_gen_event_constraints;
-               x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
-               pr_cont("Atom events, ");
-               break;
-
-       case 55: /* 22nm Atom "Silvermont"                */
-       case 76: /* 14nm Atom "Airmont"                   */
-       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
-               memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
-                       sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
-                      sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_atom();
-
-               x86_pmu.event_constraints = intel_slm_event_constraints;
-               x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_slm_extra_regs;
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               pr_cont("Silvermont events, ");
-               break;
-
-       case 37: /* 32nm Westmere    */
-       case 44: /* 32nm Westmere-EP */
-       case 47: /* 32nm Westmere-EX */
-               memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
-                      sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_nhm();
-
-               x86_pmu.event_constraints = intel_westmere_event_constraints;
-               x86_pmu.enable_all = intel_pmu_nhm_enable_all;
-               x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_westmere_extra_regs;
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-
-               x86_pmu.cpu_events = nhm_events_attrs;
-
-               /* UOPS_ISSUED.STALLED_CYCLES */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
-                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
-               /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
-                       X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
-
-               pr_cont("Westmere events, ");
-               break;
-
-       case 42: /* 32nm SandyBridge         */
-       case 45: /* 32nm SandyBridge-E/EN/EP */
-               x86_add_quirk(intel_sandybridge_quirk);
-               x86_add_quirk(intel_ht_bug);
-               memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
-                      sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_snb();
-
-               x86_pmu.event_constraints = intel_snb_event_constraints;
-               x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
-               if (boot_cpu_data.x86_model == 45)
-                       x86_pmu.extra_regs = intel_snbep_extra_regs;
-               else
-                       x86_pmu.extra_regs = intel_snb_extra_regs;
-
-
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               x86_pmu.cpu_events = snb_events_attrs;
-
-               /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
-                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
-               /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
-                       X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
-
-               pr_cont("SandyBridge events, ");
-               break;
-
-       case 58: /* 22nm IvyBridge       */
-       case 62: /* 22nm IvyBridge-EP/EX */
-               x86_add_quirk(intel_ht_bug);
-               memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-               /* dTLB-load-misses on IVB is different than SNB */
-               hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
-
-               memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
-                      sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_snb();
-
-               x86_pmu.event_constraints = intel_ivb_event_constraints;
-               x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
-               x86_pmu.pebs_prec_dist = true;
-               if (boot_cpu_data.x86_model == 62)
-                       x86_pmu.extra_regs = intel_snbep_extra_regs;
-               else
-                       x86_pmu.extra_regs = intel_snb_extra_regs;
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               x86_pmu.cpu_events = snb_events_attrs;
-
-               /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
-                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
-
-               pr_cont("IvyBridge events, ");
-               break;
-
-
-       case 60: /* 22nm Haswell Core */
-       case 63: /* 22nm Haswell Server */
-       case 69: /* 22nm Haswell ULT */
-       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-               x86_add_quirk(intel_ht_bug);
-               x86_pmu.late_ack = true;
-               memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_hsw();
-
-               x86_pmu.event_constraints = intel_hsw_event_constraints;
-               x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_snbep_extra_regs;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
-               x86_pmu.pebs_prec_dist = true;
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               x86_pmu.hw_config = hsw_hw_config;
-               x86_pmu.get_event_constraints = hsw_get_event_constraints;
-               x86_pmu.cpu_events = hsw_events_attrs;
-               x86_pmu.lbr_double_abort = true;
-               pr_cont("Haswell events, ");
-               break;
-
-       case 61: /* 14nm Broadwell Core-M */
-       case 86: /* 14nm Broadwell Xeon D */
-       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
-       case 79: /* 14nm Broadwell Server */
-               x86_pmu.late_ack = true;
-               memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-
-               /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
-               hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
-                                                                        BDW_L3_MISS|HSW_SNOOP_DRAM;
-               hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
-                                                                         HSW_SNOOP_DRAM;
-               hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
-                                                                            BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
-               hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
-                                                                             BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
-
-               intel_pmu_lbr_init_hsw();
-
-               x86_pmu.event_constraints = intel_bdw_event_constraints;
-               x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_snbep_extra_regs;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
-               x86_pmu.pebs_prec_dist = true;
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               x86_pmu.hw_config = hsw_hw_config;
-               x86_pmu.get_event_constraints = hsw_get_event_constraints;
-               x86_pmu.cpu_events = hsw_events_attrs;
-               x86_pmu.limit_period = bdw_limit_period;
-               pr_cont("Broadwell events, ");
-               break;
-
-       case 87: /* Knights Landing Xeon Phi */
-               memcpy(hw_cache_event_ids,
-                      slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs,
-                      knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-               intel_pmu_lbr_init_knl();
-
-               x86_pmu.event_constraints = intel_slm_event_constraints;
-               x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_knl_extra_regs;
-
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               pr_cont("Knights Landing events, ");
-               break;
-
-       case 78: /* 14nm Skylake Mobile */
-       case 94: /* 14nm Skylake Desktop */
-               x86_pmu.late_ack = true;
-               memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-               intel_pmu_lbr_init_skl();
-
-               x86_pmu.event_constraints = intel_skl_event_constraints;
-               x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_skl_extra_regs;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
-               x86_pmu.pebs_prec_dist = true;
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               x86_pmu.hw_config = hsw_hw_config;
-               x86_pmu.get_event_constraints = hsw_get_event_constraints;
-               x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
-                                                 skl_format_attr);
-               WARN_ON(!x86_pmu.format_attrs);
-               x86_pmu.cpu_events = hsw_events_attrs;
-               pr_cont("Skylake events, ");
-               break;
-
-       default:
-               switch (x86_pmu.version) {
-               case 1:
-                       x86_pmu.event_constraints = intel_v1_event_constraints;
-                       pr_cont("generic architected perfmon v1, ");
-                       break;
-               default:
-                       /*
-                        * default constraints for v2 and up
-                        */
-                       x86_pmu.event_constraints = intel_gen_event_constraints;
-                       pr_cont("generic architected perfmon, ");
-                       break;
-               }
-       }
-
-       if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
-               WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
-                    x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
-               x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
-       }
-       x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
-
-       if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
-               WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
-                    x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
-               x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
-       }
-
-       x86_pmu.intel_ctrl |=
-               ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
-
-       if (x86_pmu.event_constraints) {
-               /*
-                * event on fixed counter2 (REF_CYCLES) only works on this
-                * counter, so do not extend mask to generic counters
-                */
-               for_each_event_constraint(c, x86_pmu.event_constraints) {
-                       if (c->cmask == FIXED_EVENT_FLAGS
-                           && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
-                               c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
-                       }
-                       c->idxmsk64 &=
-                               ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
-                       c->weight = hweight64(c->idxmsk64);
-               }
-       }
-
-       /*
-        * Access LBR MSR may cause #GP under certain circumstances.
-        * E.g. KVM doesn't support LBR MSR
-        * Check all LBT MSR here.
-        * Disable LBR access if any LBR MSRs can not be accessed.
-        */
-       if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
-               x86_pmu.lbr_nr = 0;
-       for (i = 0; i < x86_pmu.lbr_nr; i++) {
-               if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
-                     check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
-                       x86_pmu.lbr_nr = 0;
-       }
-
-       /*
-        * Access extra MSR may cause #GP under certain circumstances.
-        * E.g. KVM doesn't support offcore event
-        * Check all extra_regs here.
-        */
-       if (x86_pmu.extra_regs) {
-               for (er = x86_pmu.extra_regs; er->msr; er++) {
-                       er->extra_msr_access = check_msr(er->msr, 0x11UL);
-                       /* Disable LBR select mapping */
-                       if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
-                               x86_pmu.lbr_sel_map = NULL;
-               }
-       }
-
-       /* Support full width counters using alternative MSR range */
-       if (x86_pmu.intel_cap.full_width_write) {
-               x86_pmu.max_period = x86_pmu.cntval_mask;
-               x86_pmu.perfctr = MSR_IA32_PMC0;
-               pr_cont("full-width counters, ");
-       }
-
-       return 0;
-}
-
-/*
- * HT bug: phase 2 init
- * Called once we have valid topology information to check
- * whether or not HT is enabled
- * If HT is off, then we disable the workaround
- */
-static __init int fixup_ht_bug(void)
-{
-       int cpu = smp_processor_id();
-       int w, c;
-       /*
-        * problem not present on this CPU model, nothing to do
-        */
-       if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
-               return 0;
-
-       w = cpumask_weight(topology_sibling_cpumask(cpu));
-       if (w > 1) {
-               pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
-               return 0;
-       }
-
-       if (lockup_detector_suspend() != 0) {
-               pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
-               return 0;
-       }
-
-       x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
-
-       x86_pmu.start_scheduling = NULL;
-       x86_pmu.commit_scheduling = NULL;
-       x86_pmu.stop_scheduling = NULL;
-
-       lockup_detector_resume();
-
-       get_online_cpus();
-
-       for_each_online_cpu(c) {
-               free_excl_cntrs(c);
-       }
-
-       put_online_cpus();
-       pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
-       return 0;
-}
-subsys_initcall(fixup_ht_bug)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c

deleted file mode 100644 (file)

index 2cad71d..0000000
--- a/arch/x86/kernel/cpu/perf_event_intel_bts.c
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- * BTS PMU driver for perf
- * Copyright (c) 2013-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- */
-
-#undef DEBUG
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/debugfs.h>
-#include <linux/device.h>
-#include <linux/coredump.h>
-
-#include <asm-generic/sizes.h>
-#include <asm/perf_event.h>
-
-#include "perf_event.h"
-
-struct bts_ctx {
-       struct perf_output_handle       handle;
-       struct debug_store              ds_back;
-       int                             started;
-};
-
-static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
-
-#define BTS_RECORD_SIZE                24
-#define BTS_SAFETY_MARGIN      4080
-
-struct bts_phys {
-       struct page     *page;
-       unsigned long   size;
-       unsigned long   offset;
-       unsigned long   displacement;
-};
-
-struct bts_buffer {
-       size_t          real_size;      /* multiple of BTS_RECORD_SIZE */
-       unsigned int    nr_pages;
-       unsigned int    nr_bufs;
-       unsigned int    cur_buf;
-       bool            snapshot;
-       local_t         data_size;
-       local_t         lost;
-       local_t         head;
-       unsigned long   end;
-       void            **data_pages;
-       struct bts_phys buf[0];
-};
-
-struct pmu bts_pmu;
-
-static size_t buf_size(struct page *page)
-{
-       return 1 << (PAGE_SHIFT + page_private(page));
-}
-
-static void *
-bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
-{
-       struct bts_buffer *buf;
-       struct page *page;
-       int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
-       unsigned long offset;
-       size_t size = nr_pages << PAGE_SHIFT;
-       int pg, nbuf, pad;
-
-       /* count all the high order buffers */
-       for (pg = 0, nbuf = 0; pg < nr_pages;) {
-               page = virt_to_page(pages[pg]);
-               if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1))
-                       return NULL;
-               pg += 1 << page_private(page);
-               nbuf++;
-       }
-
-       /*
-        * to avoid interrupts in overwrite mode, only allow one physical
-        */
-       if (overwrite && nbuf > 1)
-               return NULL;
-
-       buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
-       if (!buf)
-               return NULL;
-
-       buf->nr_pages = nr_pages;
-       buf->nr_bufs = nbuf;
-       buf->snapshot = overwrite;
-       buf->data_pages = pages;
-       buf->real_size = size - size % BTS_RECORD_SIZE;
-
-       for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
-               unsigned int __nr_pages;
-
-               page = virt_to_page(pages[pg]);
-               __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
-               buf->buf[nbuf].page = page;
-               buf->buf[nbuf].offset = offset;
-               buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
-               buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
-               pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
-               buf->buf[nbuf].size -= pad;
-
-               pg += __nr_pages;
-               offset += __nr_pages << PAGE_SHIFT;
-       }
-
-       return buf;
-}
-
-static void bts_buffer_free_aux(void *data)
-{
-       kfree(data);
-}
-
-static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
-{
-       return buf->buf[idx].offset + buf->buf[idx].displacement;
-}
-
-static void
-bts_config_buffer(struct bts_buffer *buf)
-{
-       int cpu = raw_smp_processor_id();
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-       struct bts_phys *phys = &buf->buf[buf->cur_buf];
-       unsigned long index, thresh = 0, end = phys->size;
-       struct page *page = phys->page;
-
-       index = local_read(&buf->head);
-
-       if (!buf->snapshot) {
-               if (buf->end < phys->offset + buf_size(page))
-                       end = buf->end - phys->offset - phys->displacement;
-
-               index -= phys->offset + phys->displacement;
-
-               if (end - index > BTS_SAFETY_MARGIN)
-                       thresh = end - BTS_SAFETY_MARGIN;
-               else if (end - index > BTS_RECORD_SIZE)
-                       thresh = end - BTS_RECORD_SIZE;
-               else
-                       thresh = end;
-       }
-
-       ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
-       ds->bts_index = ds->bts_buffer_base + index;
-       ds->bts_absolute_maximum = ds->bts_buffer_base + end;
-       ds->bts_interrupt_threshold = !buf->snapshot
-               ? ds->bts_buffer_base + thresh
-               : ds->bts_absolute_maximum + BTS_RECORD_SIZE;
-}
-
-static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
-{
-       unsigned long index = head - phys->offset;
-
-       memset(page_address(phys->page) + index, 0, phys->size - index);
-}
-
-static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts)
-{
-       if (buf->snapshot)
-               return false;
-
-       if (local_read(&buf->data_size) >= bts->handle.size ||
-           bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE)
-               return true;
-
-       return false;
-}
-
-static void bts_update(struct bts_ctx *bts)
-{
-       int cpu = raw_smp_processor_id();
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-       struct bts_buffer *buf = perf_get_aux(&bts->handle);
-       unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;
-
-       if (!buf)
-               return;
-
-       head = index + bts_buffer_offset(buf, buf->cur_buf);
-       old = local_xchg(&buf->head, head);
-
-       if (!buf->snapshot) {
-               if (old == head)
-                       return;
-
-               if (ds->bts_index >= ds->bts_absolute_maximum)
-                       local_inc(&buf->lost);
-
-               /*
-                * old and head are always in the same physical buffer, so we
-                * can subtract them to get the data size.
-                */
-               local_add(head - old, &buf->data_size);
-       } else {
-               local_set(&buf->data_size, head);
-       }
-}
-
-static void __bts_event_start(struct perf_event *event)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-       struct bts_buffer *buf = perf_get_aux(&bts->handle);
-       u64 config = 0;
-
-       if (!buf || bts_buffer_is_full(buf, bts))
-               return;
-
-       event->hw.itrace_started = 1;
-       event->hw.state = 0;
-
-       if (!buf->snapshot)
-               config |= ARCH_PERFMON_EVENTSEL_INT;
-       if (!event->attr.exclude_kernel)
-               config |= ARCH_PERFMON_EVENTSEL_OS;
-       if (!event->attr.exclude_user)
-               config |= ARCH_PERFMON_EVENTSEL_USR;
-
-       bts_config_buffer(buf);
-
-       /*
-        * local barrier to make sure that ds configuration made it
-        * before we enable BTS
-        */
-       wmb();
-
-       intel_pmu_enable_bts(config);
-}
-
-static void bts_event_start(struct perf_event *event, int flags)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-
-       __bts_event_start(event);
-
-       /* PMI handler: this counter is running and likely generating PMIs */
-       ACCESS_ONCE(bts->started) = 1;
-}
-
-static void __bts_event_stop(struct perf_event *event)
-{
-       /*
-        * No extra synchronization is mandated by the documentation to have
-        * BTS data stores globally visible.
-        */
-       intel_pmu_disable_bts();
-
-       if (event->hw.state & PERF_HES_STOPPED)
-               return;
-
-       ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
-}
-
-static void bts_event_stop(struct perf_event *event, int flags)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-
-       /* PMI handler: don't restart this counter */
-       ACCESS_ONCE(bts->started) = 0;
-
-       __bts_event_stop(event);
-
-       if (flags & PERF_EF_UPDATE)
-               bts_update(bts);
-}
-
-void intel_bts_enable_local(void)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-
-       if (bts->handle.event && bts->started)
-               __bts_event_start(bts->handle.event);
-}
-
-void intel_bts_disable_local(void)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-
-       if (bts->handle.event)
-               __bts_event_stop(bts->handle.event);
-}
-
-static int
-bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
-{
-       unsigned long head, space, next_space, pad, gap, skip, wakeup;
-       unsigned int next_buf;
-       struct bts_phys *phys, *next_phys;
-       int ret;
-
-       if (buf->snapshot)
-               return 0;
-
-       head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
-       if (WARN_ON_ONCE(head != local_read(&buf->head)))
-               return -EINVAL;
-
-       phys = &buf->buf[buf->cur_buf];
-       space = phys->offset + phys->displacement + phys->size - head;
-       pad = space;
-       if (space > handle->size) {
-               space = handle->size;
-               space -= space % BTS_RECORD_SIZE;
-       }
-       if (space <= BTS_SAFETY_MARGIN) {
-               /* See if next phys buffer has more space */
-               next_buf = buf->cur_buf + 1;
-               if (next_buf >= buf->nr_bufs)
-                       next_buf = 0;
-               next_phys = &buf->buf[next_buf];
-               gap = buf_size(phys->page) - phys->displacement - phys->size +
-                     next_phys->displacement;
-               skip = pad + gap;
-               if (handle->size >= skip) {
-                       next_space = next_phys->size;
-                       if (next_space + skip > handle->size) {
-                               next_space = handle->size - skip;
-                               next_space -= next_space % BTS_RECORD_SIZE;
-                       }
-                       if (next_space > space || !space) {
-                               if (pad)
-                                       bts_buffer_pad_out(phys, head);
-                               ret = perf_aux_output_skip(handle, skip);
-                               if (ret)
-                                       return ret;
-                               /* Advance to next phys buffer */
-                               phys = next_phys;
-                               space = next_space;
-                               head = phys->offset + phys->displacement;
-                               /*
-                                * After this, cur_buf and head won't match ds
-                                * anymore, so we must not be racing with
-                                * bts_update().
-                                */
-                               buf->cur_buf = next_buf;
-                               local_set(&buf->head, head);
-                       }
-               }
-       }
-
-       /* Don't go far beyond wakeup watermark */
-       wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
-                handle->head;
-       if (space > wakeup) {
-               space = wakeup;
-               space -= space % BTS_RECORD_SIZE;
-       }
-
-       buf->end = head + space;
-
-       /*
-        * If we have no space, the lost notification would have been sent when
-        * we hit absolute_maximum - see bts_update()
-        */
-       if (!space)
-               return -ENOSPC;
-
-       return 0;
-}
-
-int intel_bts_interrupt(void)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-       struct perf_event *event = bts->handle.event;
-       struct bts_buffer *buf;
-       s64 old_head;
-       int err;
-
-       if (!event || !bts->started)
-               return 0;
-
-       buf = perf_get_aux(&bts->handle);
-       /*
-        * Skip snapshot counters: they don't use the interrupt, but
-        * there's no other way of telling, because the pointer will
-        * keep moving
-        */
-       if (!buf || buf->snapshot)
-               return 0;
-
-       old_head = local_read(&buf->head);
-       bts_update(bts);
-
-       /* no new data */
-       if (old_head == local_read(&buf->head))
-               return 0;
-
-       perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
-                           !!local_xchg(&buf->lost, 0));
-
-       buf = perf_aux_output_begin(&bts->handle, event);
-       if (!buf)
-               return 1;
-
-       err = bts_buffer_reset(buf, &bts->handle);
-       if (err)
-               perf_aux_output_end(&bts->handle, 0, false);
-
-       return 1;
-}
-
-static void bts_event_del(struct perf_event *event, int mode)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-       struct bts_buffer *buf = perf_get_aux(&bts->handle);
-
-       bts_event_stop(event, PERF_EF_UPDATE);
-
-       if (buf) {
-               if (buf->snapshot)
-                       bts->handle.head =
-                               local_xchg(&buf->data_size,
-                                          buf->nr_pages << PAGE_SHIFT);
-               perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
-                                   !!local_xchg(&buf->lost, 0));
-       }
-
-       cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
-       cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
-       cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
-       cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
-}
-
-static int bts_event_add(struct perf_event *event, int mode)
-{
-       struct bts_buffer *buf;
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct hw_perf_event *hwc = &event->hw;
-       int ret = -EBUSY;
-
-       event->hw.state = PERF_HES_STOPPED;
-
-       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
-               return -EBUSY;
-
-       if (bts->handle.event)
-               return -EBUSY;
-
-       buf = perf_aux_output_begin(&bts->handle, event);
-       if (!buf)
-               return -EINVAL;
-
-       ret = bts_buffer_reset(buf, &bts->handle);
-       if (ret) {
-               perf_aux_output_end(&bts->handle, 0, false);
-               return ret;
-       }
-
-       bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
-       bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
-       bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
-
-       if (mode & PERF_EF_START) {
-               bts_event_start(event, 0);
-               if (hwc->state & PERF_HES_STOPPED) {
-                       bts_event_del(event, 0);
-                       return -EBUSY;
-               }
-       }
-
-       return 0;
-}
-
-static void bts_event_destroy(struct perf_event *event)
-{
-       x86_release_hardware();
-       x86_del_exclusive(x86_lbr_exclusive_bts);
-}
-
-static int bts_event_init(struct perf_event *event)
-{
-       int ret;
-
-       if (event->attr.type != bts_pmu.type)
-               return -ENOENT;
-
-       if (x86_add_exclusive(x86_lbr_exclusive_bts))
-               return -EBUSY;
-
-       /*
-        * BTS leaks kernel addresses even when CPL0 tracing is
-        * disabled, so disallow intel_bts driver for unprivileged
-        * users on paranoid systems since it provides trace data
-        * to the user in a zero-copy fashion.
-        *
-        * Note that the default paranoia setting permits unprivileged
-        * users to profile the kernel.
-        */
-       if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
-           !capable(CAP_SYS_ADMIN))
-               return -EACCES;
-
-       ret = x86_reserve_hardware();
-       if (ret) {
-               x86_del_exclusive(x86_lbr_exclusive_bts);
-               return ret;
-       }
-
-       event->destroy = bts_event_destroy;
-
-       return 0;
-}
-
-static void bts_event_read(struct perf_event *event)
-{
-}
-
-static __init int bts_init(void)
-{
-       if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
-               return -ENODEV;
-
-       bts_pmu.capabilities    = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
-       bts_pmu.task_ctx_nr     = perf_sw_context;
-       bts_pmu.event_init      = bts_event_init;
-       bts_pmu.add             = bts_event_add;
-       bts_pmu.del             = bts_event_del;
-       bts_pmu.start           = bts_event_start;
-       bts_pmu.stop            = bts_event_stop;
-       bts_pmu.read            = bts_event_read;
-       bts_pmu.setup_aux       = bts_buffer_setup_aux;
-       bts_pmu.free_aux        = bts_buffer_free_aux;
-
-       return perf_pmu_register(&bts_pmu, "intel_bts", -1);
-}
-arch_initcall(bts_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c

deleted file mode 100644 (file)

index a316ca9..0000000
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ /dev/null
@@ -1,1391 +0,0 @@
-/*
- * Intel Cache Quality-of-Service Monitoring (CQM) support.
- *
- * Based very, very heavily on work by Peter Zijlstra.
- */
-
-#include <linux/perf_event.h>
-#include <linux/slab.h>
-#include <asm/cpu_device_id.h>
-#include "perf_event.h"
-
-#define MSR_IA32_PQR_ASSOC     0x0c8f
-#define MSR_IA32_QM_CTR                0x0c8e
-#define MSR_IA32_QM_EVTSEL     0x0c8d
-
-static u32 cqm_max_rmid = -1;
-static unsigned int cqm_l3_scale; /* supposedly cacheline size */
-
-/**
- * struct intel_pqr_state - State cache for the PQR MSR
- * @rmid:              The cached Resource Monitoring ID
- * @closid:            The cached Class Of Service ID
- * @rmid_usecnt:       The usage counter for rmid
- *
- * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
- * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
- * contains both parts, so we need to cache them.
- *
- * The cache also helps to avoid pointless updates if the value does
- * not change.
- */
-struct intel_pqr_state {
-       u32                     rmid;
-       u32                     closid;
-       int                     rmid_usecnt;
-};
-
-/*
- * The cached intel_pqr_state is strictly per CPU and can never be
- * updated from a remote CPU. Both functions which modify the state
- * (intel_cqm_event_start and intel_cqm_event_stop) are called with
- * interrupts disabled, which is sufficient for the protection.
- */
-static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
-
-/*
- * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
- * Also protects event->hw.cqm_rmid
- *
- * Hold either for stability, both for modification of ->hw.cqm_rmid.
- */
-static DEFINE_MUTEX(cache_mutex);
-static DEFINE_RAW_SPINLOCK(cache_lock);
-
-/*
- * Groups of events that have the same target(s), one RMID per group.
- */
-static LIST_HEAD(cache_groups);
-
-/*
- * Mask of CPUs for reading CQM values. We only need one per-socket.
- */
-static cpumask_t cqm_cpumask;
-
-#define RMID_VAL_ERROR         (1ULL << 63)
-#define RMID_VAL_UNAVAIL       (1ULL << 62)
-
-#define QOS_L3_OCCUP_EVENT_ID  (1 << 0)
-
-#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
-
-/*
- * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
- *
- * This rmid is always free and is guaranteed to have an associated
- * near-zero occupancy value, i.e. no cachelines are tagged with this
- * RMID, once __intel_cqm_rmid_rotate() returns.
- */
-static u32 intel_cqm_rotation_rmid;
-
-#define INVALID_RMID           (-1)
-
-/*
- * Is @rmid valid for programming the hardware?
- *
- * rmid 0 is reserved by the hardware for all non-monitored tasks, which
- * means that we should never come across an rmid with that value.
- * Likewise, an rmid value of -1 is used to indicate "no rmid currently
- * assigned" and is used as part of the rotation code.
- */
-static inline bool __rmid_valid(u32 rmid)
-{
-       if (!rmid || rmid == INVALID_RMID)
-               return false;
-
-       return true;
-}
-
-static u64 __rmid_read(u32 rmid)
-{
-       u64 val;
-
-       /*
-        * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
-        * it just says that to increase confusion.
-        */
-       wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
-       rdmsrl(MSR_IA32_QM_CTR, val);
-
-       /*
-        * Aside from the ERROR and UNAVAIL bits, assume this thing returns
-        * the number of cachelines tagged with @rmid.
-        */
-       return val;
-}
-
-enum rmid_recycle_state {
-       RMID_YOUNG = 0,
-       RMID_AVAILABLE,
-       RMID_DIRTY,
-};
-
-struct cqm_rmid_entry {
-       u32 rmid;
-       enum rmid_recycle_state state;
-       struct list_head list;
-       unsigned long queue_time;
-};
-
-/*
- * cqm_rmid_free_lru - A least recently used list of RMIDs.
- *
- * Oldest entry at the head, newest (most recently used) entry at the
- * tail. This list is never traversed, it's only used to keep track of
- * the lru order. That is, we only pick entries of the head or insert
- * them on the tail.
- *
- * All entries on the list are 'free', and their RMIDs are not currently
- * in use. To mark an RMID as in use, remove its entry from the lru
- * list.
- *
- *
- * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
- *
- * This list is contains RMIDs that no one is currently using but that
- * may have a non-zero occupancy value associated with them. The
- * rotation worker moves RMIDs from the limbo list to the free list once
- * the occupancy value drops below __intel_cqm_threshold.
- *
- * Both lists are protected by cache_mutex.
- */
-static LIST_HEAD(cqm_rmid_free_lru);
-static LIST_HEAD(cqm_rmid_limbo_lru);
-
-/*
- * We use a simple array of pointers so that we can lookup a struct
- * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
- * and __put_rmid() from having to worry about dealing with struct
- * cqm_rmid_entry - they just deal with rmids, i.e. integers.
- *
- * Once this array is initialized it is read-only. No locks are required
- * to access it.
- *
- * All entries for all RMIDs can be looked up in the this array at all
- * times.
- */
-static struct cqm_rmid_entry **cqm_rmid_ptrs;
-
-static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
-{
-       struct cqm_rmid_entry *entry;
-
-       entry = cqm_rmid_ptrs[rmid];
-       WARN_ON(entry->rmid != rmid);
-
-       return entry;
-}
-
-/*
- * Returns < 0 on fail.
- *
- * We expect to be called with cache_mutex held.
- */
-static u32 __get_rmid(void)
-{
-       struct cqm_rmid_entry *entry;
-
-       lockdep_assert_held(&cache_mutex);
-
-       if (list_empty(&cqm_rmid_free_lru))
-               return INVALID_RMID;
-
-       entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
-       list_del(&entry->list);
-
-       return entry->rmid;
-}
-
-static void __put_rmid(u32 rmid)
-{
-       struct cqm_rmid_entry *entry;
-
-       lockdep_assert_held(&cache_mutex);
-
-       WARN_ON(!__rmid_valid(rmid));
-       entry = __rmid_entry(rmid);
-
-       entry->queue_time = jiffies;
-       entry->state = RMID_YOUNG;
-
-       list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
-}
-
-static int intel_cqm_setup_rmid_cache(void)
-{
-       struct cqm_rmid_entry *entry;
-       unsigned int nr_rmids;
-       int r = 0;
-
-       nr_rmids = cqm_max_rmid + 1;
-       cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
-                               nr_rmids, GFP_KERNEL);
-       if (!cqm_rmid_ptrs)
-               return -ENOMEM;
-
-       for (; r <= cqm_max_rmid; r++) {
-               struct cqm_rmid_entry *entry;
-
-               entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-               if (!entry)
-                       goto fail;
-
-               INIT_LIST_HEAD(&entry->list);
-               entry->rmid = r;
-               cqm_rmid_ptrs[r] = entry;
-
-               list_add_tail(&entry->list, &cqm_rmid_free_lru);
-       }
-
-       /*
-        * RMID 0 is special and is always allocated. It's used for all
-        * tasks that are not monitored.
-        */
-       entry = __rmid_entry(0);
-       list_del(&entry->list);
-
-       mutex_lock(&cache_mutex);
-       intel_cqm_rotation_rmid = __get_rmid();
-       mutex_unlock(&cache_mutex);
-
-       return 0;
-fail:
-       while (r--)
-               kfree(cqm_rmid_ptrs[r]);
-
-       kfree(cqm_rmid_ptrs);
-       return -ENOMEM;
-}
-
-/*
- * Determine if @a and @b measure the same set of tasks.
- *
- * If @a and @b measure the same set of tasks then we want to share a
- * single RMID.
- */
-static bool __match_event(struct perf_event *a, struct perf_event *b)
-{
-       /* Per-cpu and task events don't mix */
-       if ((a->attach_state & PERF_ATTACH_TASK) !=
-           (b->attach_state & PERF_ATTACH_TASK))
-               return false;
-
-#ifdef CONFIG_CGROUP_PERF
-       if (a->cgrp != b->cgrp)
-               return false;
-#endif
-
-       /* If not task event, we're machine wide */
-       if (!(b->attach_state & PERF_ATTACH_TASK))
-               return true;
-
-       /*
-        * Events that target same task are placed into the same cache group.
-        */
-       if (a->hw.target == b->hw.target)
-               return true;
-
-       /*
-        * Are we an inherited event?
-        */
-       if (b->parent == a)
-               return true;
-
-       return false;
-}
-
-#ifdef CONFIG_CGROUP_PERF
-static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
-{
-       if (event->attach_state & PERF_ATTACH_TASK)
-               return perf_cgroup_from_task(event->hw.target, event->ctx);
-
-       return event->cgrp;
-}
-#endif
-
-/*
- * Determine if @a's tasks intersect with @b's tasks
- *
- * There are combinations of events that we explicitly prohibit,
- *
- *                PROHIBITS
- *     system-wide    ->       cgroup and task
- *     cgroup        ->        system-wide
- *                           ->        task in cgroup
- *     task          ->        system-wide
- *                           ->        task in cgroup
- *
- * Call this function before allocating an RMID.
- */
-static bool __conflict_event(struct perf_event *a, struct perf_event *b)
-{
-#ifdef CONFIG_CGROUP_PERF
-       /*
-        * We can have any number of cgroups but only one system-wide
-        * event at a time.
-        */
-       if (a->cgrp && b->cgrp) {
-               struct perf_cgroup *ac = a->cgrp;
-               struct perf_cgroup *bc = b->cgrp;
-
-               /*
-                * This condition should have been caught in
-                * __match_event() and we should be sharing an RMID.
-                */
-               WARN_ON_ONCE(ac == bc);
-
-               if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
-                   cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
-                       return true;
-
-               return false;
-       }
-
-       if (a->cgrp || b->cgrp) {
-               struct perf_cgroup *ac, *bc;
-
-               /*
-                * cgroup and system-wide events are mutually exclusive
-                */
-               if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
-                   (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
-                       return true;
-
-               /*
-                * Ensure neither event is part of the other's cgroup
-                */
-               ac = event_to_cgroup(a);
-               bc = event_to_cgroup(b);
-               if (ac == bc)
-                       return true;
-
-               /*
-                * Must have cgroup and non-intersecting task events.
-                */
-               if (!ac || !bc)
-                       return false;
-
-               /*
-                * We have cgroup and task events, and the task belongs
-                * to a cgroup. Check for for overlap.
-                */
-               if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
-                   cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
-                       return true;
-
-               return false;
-       }
-#endif
-       /*
-        * If one of them is not a task, same story as above with cgroups.
-        */
-       if (!(a->attach_state & PERF_ATTACH_TASK) ||
-           !(b->attach_state & PERF_ATTACH_TASK))
-               return true;
-
-       /*
-        * Must be non-overlapping.
-        */
-       return false;
-}
-
-struct rmid_read {
-       u32 rmid;
-       atomic64_t value;
-};
-
-static void __intel_cqm_event_count(void *info);
-
-/*
- * Exchange the RMID of a group of events.
- */
-static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
-{
-       struct perf_event *event;
-       struct list_head *head = &group->hw.cqm_group_entry;
-       u32 old_rmid = group->hw.cqm_rmid;
-
-       lockdep_assert_held(&cache_mutex);
-
-       /*
-        * If our RMID is being deallocated, perform a read now.
-        */
-       if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
-               struct rmid_read rr = {
-                       .value = ATOMIC64_INIT(0),
-                       .rmid = old_rmid,
-               };
-
-               on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
-                                &rr, 1);
-               local64_set(&group->count, atomic64_read(&rr.value));
-       }
-
-       raw_spin_lock_irq(&cache_lock);
-
-       group->hw.cqm_rmid = rmid;
-       list_for_each_entry(event, head, hw.cqm_group_entry)
-               event->hw.cqm_rmid = rmid;
-
-       raw_spin_unlock_irq(&cache_lock);
-
-       return old_rmid;
-}
-
-/*
- * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
- * cachelines are still tagged with RMIDs in limbo, we progressively
- * increment the threshold until we find an RMID in limbo with <=
- * __intel_cqm_threshold lines tagged. This is designed to mitigate the
- * problem where cachelines tagged with an RMID are not steadily being
- * evicted.
- *
- * On successful rotations we decrease the threshold back towards zero.
- *
- * __intel_cqm_max_threshold provides an upper bound on the threshold,
- * and is measured in bytes because it's exposed to userland.
- */
-static unsigned int __intel_cqm_threshold;
-static unsigned int __intel_cqm_max_threshold;
-
-/*
- * Test whether an RMID has a zero occupancy value on this cpu.
- */
-static void intel_cqm_stable(void *arg)
-{
-       struct cqm_rmid_entry *entry;
-
-       list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
-               if (entry->state != RMID_AVAILABLE)
-                       break;
-
-               if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
-                       entry->state = RMID_DIRTY;
-       }
-}
-
-/*
- * If we have group events waiting for an RMID that don't conflict with
- * events already running, assign @rmid.
- */
-static bool intel_cqm_sched_in_event(u32 rmid)
-{
-       struct perf_event *leader, *event;
-
-       lockdep_assert_held(&cache_mutex);
-
-       leader = list_first_entry(&cache_groups, struct perf_event,
-                                 hw.cqm_groups_entry);
-       event = leader;
-
-       list_for_each_entry_continue(event, &cache_groups,
-                                    hw.cqm_groups_entry) {
-               if (__rmid_valid(event->hw.cqm_rmid))
-                       continue;
-
-               if (__conflict_event(event, leader))
-                       continue;
-
-               intel_cqm_xchg_rmid(event, rmid);
-               return true;
-       }
-
-       return false;
-}
-
-/*
- * Initially use this constant for both the limbo queue time and the
- * rotation timer interval, pmu::hrtimer_interval_ms.
- *
- * They don't need to be the same, but the two are related since if you
- * rotate faster than you recycle RMIDs, you may run out of available
- * RMIDs.
- */
-#define RMID_DEFAULT_QUEUE_TIME 250    /* ms */
-
-static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
-
-/*
- * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
- * @nr_available: number of freeable RMIDs on the limbo list
- *
- * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
- * cachelines are tagged with those RMIDs. After this we can reuse them
- * and know that the current set of active RMIDs is stable.
- *
- * Return %true or %false depending on whether stabilization needs to be
- * reattempted.
- *
- * If we return %true then @nr_available is updated to indicate the
- * number of RMIDs on the limbo list that have been queued for the
- * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
- * are above __intel_cqm_threshold.
- */
-static bool intel_cqm_rmid_stabilize(unsigned int *available)
-{
-       struct cqm_rmid_entry *entry, *tmp;
-
-       lockdep_assert_held(&cache_mutex);
-
-       *available = 0;
-       list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
-               unsigned long min_queue_time;
-               unsigned long now = jiffies;
-
-               /*
-                * We hold RMIDs placed into limbo for a minimum queue
-                * time. Before the minimum queue time has elapsed we do
-                * not recycle RMIDs.
-                *
-                * The reasoning is that until a sufficient time has
-                * passed since we stopped using an RMID, any RMID
-                * placed onto the limbo list will likely still have
-                * data tagged in the cache, which means we'll probably
-                * fail to recycle it anyway.
-                *
-                * We can save ourselves an expensive IPI by skipping
-                * any RMIDs that have not been queued for the minimum
-                * time.
-                */
-               min_queue_time = entry->queue_time +
-                       msecs_to_jiffies(__rmid_queue_time_ms);
-
-               if (time_after(min_queue_time, now))
-                       break;
-
-               entry->state = RMID_AVAILABLE;
-               (*available)++;
-       }
-
-       /*
-        * Fast return if none of the RMIDs on the limbo list have been
-        * sitting on the queue for the minimum queue time.
-        */
-       if (!*available)
-               return false;
-
-       /*
-        * Test whether an RMID is free for each package.
-        */
-       on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
-
-       list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
-               /*
-                * Exhausted all RMIDs that have waited min queue time.
-                */
-               if (entry->state == RMID_YOUNG)
-                       break;
-
-               if (entry->state == RMID_DIRTY)
-                       continue;
-
-               list_del(&entry->list); /* remove from limbo */
-
-               /*
-                * The rotation RMID gets priority if it's
-                * currently invalid. In which case, skip adding
-                * the RMID to the the free lru.
-                */
-               if (!__rmid_valid(intel_cqm_rotation_rmid)) {
-                       intel_cqm_rotation_rmid = entry->rmid;
-                       continue;
-               }
-
-               /*
-                * If we have groups waiting for RMIDs, hand
-                * them one now provided they don't conflict.
-                */
-               if (intel_cqm_sched_in_event(entry->rmid))
-                       continue;
-
-               /*
-                * Otherwise place it onto the free list.
-                */
-               list_add_tail(&entry->list, &cqm_rmid_free_lru);
-       }
-
-
-       return __rmid_valid(intel_cqm_rotation_rmid);
-}
-
-/*
- * Pick a victim group and move it to the tail of the group list.
- * @next: The first group without an RMID
- */
-static void __intel_cqm_pick_and_rotate(struct perf_event *next)
-{
-       struct perf_event *rotor;
-       u32 rmid;
-
-       lockdep_assert_held(&cache_mutex);
-
-       rotor = list_first_entry(&cache_groups, struct perf_event,
-                                hw.cqm_groups_entry);
-
-       /*
-        * The group at the front of the list should always have a valid
-        * RMID. If it doesn't then no groups have RMIDs assigned and we
-        * don't need to rotate the list.
-        */
-       if (next == rotor)
-               return;
-
-       rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
-       __put_rmid(rmid);
-
-       list_rotate_left(&cache_groups);
-}
-
-/*
- * Deallocate the RMIDs from any events that conflict with @event, and
- * place them on the back of the group list.
- */
-static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
-{
-       struct perf_event *group, *g;
-       u32 rmid;
-
-       lockdep_assert_held(&cache_mutex);
-
-       list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
-               if (group == event)
-                       continue;
-
-               rmid = group->hw.cqm_rmid;
-
-               /*
-                * Skip events that don't have a valid RMID.
-                */
-               if (!__rmid_valid(rmid))
-                       continue;
-
-               /*
-                * No conflict? No problem! Leave the event alone.
-                */
-               if (!__conflict_event(group, event))
-                       continue;
-
-               intel_cqm_xchg_rmid(group, INVALID_RMID);
-               __put_rmid(rmid);
-       }
-}
-
-/*
- * Attempt to rotate the groups and assign new RMIDs.
- *
- * We rotate for two reasons,
- *   1. To handle the scheduling of conflicting events
- *   2. To recycle RMIDs
- *
- * Rotating RMIDs is complicated because the hardware doesn't give us
- * any clues.
- *
- * There's problems with the hardware interface; when you change the
- * task:RMID map cachelines retain their 'old' tags, giving a skewed
- * picture. In order to work around this, we must always keep one free
- * RMID - intel_cqm_rotation_rmid.
- *
- * Rotation works by taking away an RMID from a group (the old RMID),
- * and assigning the free RMID to another group (the new RMID). We must
- * then wait for the old RMID to not be used (no cachelines tagged).
- * This ensure that all cachelines are tagged with 'active' RMIDs. At
- * this point we can start reading values for the new RMID and treat the
- * old RMID as the free RMID for the next rotation.
- *
- * Return %true or %false depending on whether we did any rotating.
- */
-static bool __intel_cqm_rmid_rotate(void)
-{
-       struct perf_event *group, *start = NULL;
-       unsigned int threshold_limit;
-       unsigned int nr_needed = 0;
-       unsigned int nr_available;
-       bool rotated = false;
-
-       mutex_lock(&cache_mutex);
-
-again:
-       /*
-        * Fast path through this function if there are no groups and no
-        * RMIDs that need cleaning.
-        */
-       if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
-               goto out;
-
-       list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
-               if (!__rmid_valid(group->hw.cqm_rmid)) {
-                       if (!start)
-                               start = group;
-                       nr_needed++;
-               }
-       }
-
-       /*
-        * We have some event groups, but they all have RMIDs assigned
-        * and no RMIDs need cleaning.
-        */
-       if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
-               goto out;
-
-       if (!nr_needed)
-               goto stabilize;
-
-       /*
-        * We have more event groups without RMIDs than available RMIDs,
-        * or we have event groups that conflict with the ones currently
-        * scheduled.
-        *
-        * We force deallocate the rmid of the group at the head of
-        * cache_groups. The first event group without an RMID then gets
-        * assigned intel_cqm_rotation_rmid. This ensures we always make
-        * forward progress.
-        *
-        * Rotate the cache_groups list so the previous head is now the
-        * tail.
-        */
-       __intel_cqm_pick_and_rotate(start);
-
-       /*
-        * If the rotation is going to succeed, reduce the threshold so
-        * that we don't needlessly reuse dirty RMIDs.
-        */
-       if (__rmid_valid(intel_cqm_rotation_rmid)) {
-               intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
-               intel_cqm_rotation_rmid = __get_rmid();
-
-               intel_cqm_sched_out_conflicting_events(start);
-
-               if (__intel_cqm_threshold)
-                       __intel_cqm_threshold--;
-       }
-
-       rotated = true;
-
-stabilize:
-       /*
-        * We now need to stablize the RMID we freed above (if any) to
-        * ensure that the next time we rotate we have an RMID with zero
-        * occupancy value.
-        *
-        * Alternatively, if we didn't need to perform any rotation,
-        * we'll have a bunch of RMIDs in limbo that need stabilizing.
-        */
-       threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
-
-       while (intel_cqm_rmid_stabilize(&nr_available) &&
-              __intel_cqm_threshold < threshold_limit) {
-               unsigned int steal_limit;
-
-               /*
-                * Don't spin if nobody is actively waiting for an RMID,
-                * the rotation worker will be kicked as soon as an
-                * event needs an RMID anyway.
-                */
-               if (!nr_needed)
-                       break;
-
-               /* Allow max 25% of RMIDs to be in limbo. */
-               steal_limit = (cqm_max_rmid + 1) / 4;
-
-               /*
-                * We failed to stabilize any RMIDs so our rotation
-                * logic is now stuck. In order to make forward progress
-                * we have a few options:
-                *
-                *   1. rotate ("steal") another RMID
-                *   2. increase the threshold
-                *   3. do nothing
-                *
-                * We do both of 1. and 2. until we hit the steal limit.
-                *
-                * The steal limit prevents all RMIDs ending up on the
-                * limbo list. This can happen if every RMID has a
-                * non-zero occupancy above threshold_limit, and the
-                * occupancy values aren't dropping fast enough.
-                *
-                * Note that there is prioritisation at work here - we'd
-                * rather increase the number of RMIDs on the limbo list
-                * than increase the threshold, because increasing the
-                * threshold skews the event data (because we reuse
-                * dirty RMIDs) - threshold bumps are a last resort.
-                */
-               if (nr_available < steal_limit)
-                       goto again;
-
-               __intel_cqm_threshold++;
-       }
-
-out:
-       mutex_unlock(&cache_mutex);
-       return rotated;
-}
-
-static void intel_cqm_rmid_rotate(struct work_struct *work);
-
-static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
-
-static struct pmu intel_cqm_pmu;
-
-static void intel_cqm_rmid_rotate(struct work_struct *work)
-{
-       unsigned long delay;
-
-       __intel_cqm_rmid_rotate();
-
-       delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
-       schedule_delayed_work(&intel_cqm_rmid_work, delay);
-}
-
-/*
- * Find a group and setup RMID.
- *
- * If we're part of a group, we use the group's RMID.
- */
-static void intel_cqm_setup_event(struct perf_event *event,
-                                 struct perf_event **group)
-{
-       struct perf_event *iter;
-       bool conflict = false;
-       u32 rmid;
-
-       list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
-               rmid = iter->hw.cqm_rmid;
-
-               if (__match_event(iter, event)) {
-                       /* All tasks in a group share an RMID */
-                       event->hw.cqm_rmid = rmid;
-                       *group = iter;
-                       return;
-               }
-
-               /*
-                * We only care about conflicts for events that are
-                * actually scheduled in (and hence have a valid RMID).
-                */
-               if (__conflict_event(iter, event) && __rmid_valid(rmid))
-                       conflict = true;
-       }
-
-       if (conflict)
-               rmid = INVALID_RMID;
-       else
-               rmid = __get_rmid();
-
-       event->hw.cqm_rmid = rmid;
-}
-
-static void intel_cqm_event_read(struct perf_event *event)
-{
-       unsigned long flags;
-       u32 rmid;
-       u64 val;
-
-       /*
-        * Task events are handled by intel_cqm_event_count().
-        */
-       if (event->cpu == -1)
-               return;
-
-       raw_spin_lock_irqsave(&cache_lock, flags);
-       rmid = event->hw.cqm_rmid;
-
-       if (!__rmid_valid(rmid))
-               goto out;
-
-       val = __rmid_read(rmid);
-
-       /*
-        * Ignore this reading on error states and do not update the value.
-        */
-       if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
-               goto out;
-
-       local64_set(&event->count, val);
-out:
-       raw_spin_unlock_irqrestore(&cache_lock, flags);
-}
-
-static void __intel_cqm_event_count(void *info)
-{
-       struct rmid_read *rr = info;
-       u64 val;
-
-       val = __rmid_read(rr->rmid);
-
-       if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
-               return;
-
-       atomic64_add(val, &rr->value);
-}
-
-static inline bool cqm_group_leader(struct perf_event *event)
-{
-       return !list_empty(&event->hw.cqm_groups_entry);
-}
-
-static u64 intel_cqm_event_count(struct perf_event *event)
-{
-       unsigned long flags;
-       struct rmid_read rr = {
-               .value = ATOMIC64_INIT(0),
-       };
-
-       /*
-        * We only need to worry about task events. System-wide events
-        * are handled like usual, i.e. entirely with
-        * intel_cqm_event_read().
-        */
-       if (event->cpu != -1)
-               return __perf_event_count(event);
-
-       /*
-        * Only the group leader gets to report values. This stops us
-        * reporting duplicate values to userspace, and gives us a clear
-        * rule for which task gets to report the values.
-        *
-        * Note that it is impossible to attribute these values to
-        * specific packages - we forfeit that ability when we create
-        * task events.
-        */
-       if (!cqm_group_leader(event))
-               return 0;
-
-       /*
-        * Getting up-to-date values requires an SMP IPI which is not
-        * possible if we're being called in interrupt context. Return
-        * the cached values instead.
-        */
-       if (unlikely(in_interrupt()))
-               goto out;
-
-       /*
-        * Notice that we don't perform the reading of an RMID
-        * atomically, because we can't hold a spin lock across the
-        * IPIs.
-        *
-        * Speculatively perform the read, since @event might be
-        * assigned a different (possibly invalid) RMID while we're
-        * busying performing the IPI calls. It's therefore necessary to
-        * check @event's RMID afterwards, and if it has changed,
-        * discard the result of the read.
-        */
-       rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
-
-       if (!__rmid_valid(rr.rmid))
-               goto out;
-
-       on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
-
-       raw_spin_lock_irqsave(&cache_lock, flags);
-       if (event->hw.cqm_rmid == rr.rmid)
-               local64_set(&event->count, atomic64_read(&rr.value));
-       raw_spin_unlock_irqrestore(&cache_lock, flags);
-out:
-       return __perf_event_count(event);
-}
-
-static void intel_cqm_event_start(struct perf_event *event, int mode)
-{
-       struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
-       u32 rmid = event->hw.cqm_rmid;
-
-       if (!(event->hw.cqm_state & PERF_HES_STOPPED))
-               return;
-
-       event->hw.cqm_state &= ~PERF_HES_STOPPED;
-
-       if (state->rmid_usecnt++) {
-               if (!WARN_ON_ONCE(state->rmid != rmid))
-                       return;
-       } else {
-               WARN_ON_ONCE(state->rmid);
-       }
-
-       state->rmid = rmid;
-       wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
-}
-
-static void intel_cqm_event_stop(struct perf_event *event, int mode)
-{
-       struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
-
-       if (event->hw.cqm_state & PERF_HES_STOPPED)
-               return;
-
-       event->hw.cqm_state |= PERF_HES_STOPPED;
-
-       intel_cqm_event_read(event);
-
-       if (!--state->rmid_usecnt) {
-               state->rmid = 0;
-               wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
-       } else {
-               WARN_ON_ONCE(!state->rmid);
-       }
-}
-
-static int intel_cqm_event_add(struct perf_event *event, int mode)
-{
-       unsigned long flags;
-       u32 rmid;
-
-       raw_spin_lock_irqsave(&cache_lock, flags);
-
-       event->hw.cqm_state = PERF_HES_STOPPED;
-       rmid = event->hw.cqm_rmid;
-
-       if (__rmid_valid(rmid) && (mode & PERF_EF_START))
-               intel_cqm_event_start(event, mode);
-
-       raw_spin_unlock_irqrestore(&cache_lock, flags);
-
-       return 0;
-}
-
-static void intel_cqm_event_destroy(struct perf_event *event)
-{
-       struct perf_event *group_other = NULL;
-
-       mutex_lock(&cache_mutex);
-
-       /*
-        * If there's another event in this group...
-        */
-       if (!list_empty(&event->hw.cqm_group_entry)) {
-               group_other = list_first_entry(&event->hw.cqm_group_entry,
-                                              struct perf_event,
-                                              hw.cqm_group_entry);
-               list_del(&event->hw.cqm_group_entry);
-       }
-
-       /*
-        * And we're the group leader..
-        */
-       if (cqm_group_leader(event)) {
-               /*
-                * If there was a group_other, make that leader, otherwise
-                * destroy the group and return the RMID.
-                */
-               if (group_other) {
-                       list_replace(&event->hw.cqm_groups_entry,
-                                    &group_other->hw.cqm_groups_entry);
-               } else {
-                       u32 rmid = event->hw.cqm_rmid;
-
-                       if (__rmid_valid(rmid))
-                               __put_rmid(rmid);
-                       list_del(&event->hw.cqm_groups_entry);
-               }
-       }
-
-       mutex_unlock(&cache_mutex);
-}
-
-static int intel_cqm_event_init(struct perf_event *event)
-{
-       struct perf_event *group = NULL;
-       bool rotate = false;
-
-       if (event->attr.type != intel_cqm_pmu.type)
-               return -ENOENT;
-
-       if (event->attr.config & ~QOS_EVENT_MASK)
-               return -EINVAL;
-
-       /* unsupported modes and filters */
-       if (event->attr.exclude_user   ||
-           event->attr.exclude_kernel ||
-           event->attr.exclude_hv     ||
-           event->attr.exclude_idle   ||
-           event->attr.exclude_host   ||
-           event->attr.exclude_guest  ||
-           event->attr.sample_period) /* no sampling */
-               return -EINVAL;
-
-       INIT_LIST_HEAD(&event->hw.cqm_group_entry);
-       INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
-
-       event->destroy = intel_cqm_event_destroy;
-
-       mutex_lock(&cache_mutex);
-
-       /* Will also set rmid */
-       intel_cqm_setup_event(event, &group);
-
-       if (group) {
-               list_add_tail(&event->hw.cqm_group_entry,
-                             &group->hw.cqm_group_entry);
-       } else {
-               list_add_tail(&event->hw.cqm_groups_entry,
-                             &cache_groups);
-
-               /*
-                * All RMIDs are either in use or have recently been
-                * used. Kick the rotation worker to clean/free some.
-                *
-                * We only do this for the group leader, rather than for
-                * every event in a group to save on needless work.
-                */
-               if (!__rmid_valid(event->hw.cqm_rmid))
-                       rotate = true;
-       }
-
-       mutex_unlock(&cache_mutex);
-
-       if (rotate)
-               schedule_delayed_work(&intel_cqm_rmid_work, 0);
-
-       return 0;
-}
-
-EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
-EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
-EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
-EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
-EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
-
-static struct attribute *intel_cqm_events_attr[] = {
-       EVENT_PTR(intel_cqm_llc),
-       EVENT_PTR(intel_cqm_llc_pkg),
-       EVENT_PTR(intel_cqm_llc_unit),
-       EVENT_PTR(intel_cqm_llc_scale),
-       EVENT_PTR(intel_cqm_llc_snapshot),
-       NULL,
-};
-
-static struct attribute_group intel_cqm_events_group = {
-       .name = "events",
-       .attrs = intel_cqm_events_attr,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-7");
-static struct attribute *intel_cqm_formats_attr[] = {
-       &format_attr_event.attr,
-       NULL,
-};
-
-static struct attribute_group intel_cqm_format_group = {
-       .name = "format",
-       .attrs = intel_cqm_formats_attr,
-};
-
-static ssize_t
-max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
-                          char *page)
-{
-       ssize_t rv;
-
-       mutex_lock(&cache_mutex);
-       rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
-       mutex_unlock(&cache_mutex);
-
-       return rv;
-}
-
-static ssize_t
-max_recycle_threshold_store(struct device *dev,
-                           struct device_attribute *attr,
-                           const char *buf, size_t count)
-{
-       unsigned int bytes, cachelines;
-       int ret;
-
-       ret = kstrtouint(buf, 0, &bytes);
-       if (ret)
-               return ret;
-
-       mutex_lock(&cache_mutex);
-
-       __intel_cqm_max_threshold = bytes;
-       cachelines = bytes / cqm_l3_scale;
-
-       /*
-        * The new maximum takes effect immediately.
-        */
-       if (__intel_cqm_threshold > cachelines)
-               __intel_cqm_threshold = cachelines;
-
-       mutex_unlock(&cache_mutex);
-
-       return count;
-}
-
-static DEVICE_ATTR_RW(max_recycle_threshold);
-
-static struct attribute *intel_cqm_attrs[] = {
-       &dev_attr_max_recycle_threshold.attr,
-       NULL,
-};
-
-static const struct attribute_group intel_cqm_group = {
-       .attrs = intel_cqm_attrs,
-};
-
-static const struct attribute_group *intel_cqm_attr_groups[] = {
-       &intel_cqm_events_group,
-       &intel_cqm_format_group,
-       &intel_cqm_group,
-       NULL,
-};
-
-static struct pmu intel_cqm_pmu = {
-       .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
-       .attr_groups         = intel_cqm_attr_groups,
-       .task_ctx_nr         = perf_sw_context,
-       .event_init          = intel_cqm_event_init,
-       .add                 = intel_cqm_event_add,
-       .del                 = intel_cqm_event_stop,
-       .start               = intel_cqm_event_start,
-       .stop                = intel_cqm_event_stop,
-       .read                = intel_cqm_event_read,
-       .count               = intel_cqm_event_count,
-};
-
-static inline void cqm_pick_event_reader(int cpu)
-{
-       int phys_id = topology_physical_package_id(cpu);
-       int i;
-
-       for_each_cpu(i, &cqm_cpumask) {
-               if (phys_id == topology_physical_package_id(i))
-                       return; /* already got reader for this socket */
-       }
-
-       cpumask_set_cpu(cpu, &cqm_cpumask);
-}
-
-static void intel_cqm_cpu_starting(unsigned int cpu)
-{
-       struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
-       struct cpuinfo_x86 *c = &cpu_data(cpu);
-
-       state->rmid = 0;
-       state->closid = 0;
-       state->rmid_usecnt = 0;
-
-       WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
-       WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
-}
-
-static void intel_cqm_cpu_exit(unsigned int cpu)
-{
-       int phys_id = topology_physical_package_id(cpu);
-       int i;
-
-       /*
-        * Is @cpu a designated cqm reader?
-        */
-       if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
-               return;
-
-       for_each_online_cpu(i) {
-               if (i == cpu)
-                       continue;
-
-               if (phys_id == topology_physical_package_id(i)) {
-                       cpumask_set_cpu(i, &cqm_cpumask);
-                       break;
-               }
-       }
-}
-
-static int intel_cqm_cpu_notifier(struct notifier_block *nb,
-                                 unsigned long action, void *hcpu)
-{
-       unsigned int cpu  = (unsigned long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_DOWN_PREPARE:
-               intel_cqm_cpu_exit(cpu);
-               break;
-       case CPU_STARTING:
-               intel_cqm_cpu_starting(cpu);
-               cqm_pick_event_reader(cpu);
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-static const struct x86_cpu_id intel_cqm_match[] = {
-       { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
-       {}
-};
-
-static int __init intel_cqm_init(void)
-{
-       char *str, scale[20];
-       int i, cpu, ret;
-
-       if (!x86_match_cpu(intel_cqm_match))
-               return -ENODEV;
-
-       cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
-
-       /*
-        * It's possible that not all resources support the same number
-        * of RMIDs. Instead of making scheduling much more complicated
-        * (where we have to match a task's RMID to a cpu that supports
-        * that many RMIDs) just find the minimum RMIDs supported across
-        * all cpus.
-        *
-        * Also, check that the scales match on all cpus.
-        */
-       cpu_notifier_register_begin();
-
-       for_each_online_cpu(cpu) {
-               struct cpuinfo_x86 *c = &cpu_data(cpu);
-
-               if (c->x86_cache_max_rmid < cqm_max_rmid)
-                       cqm_max_rmid = c->x86_cache_max_rmid;
-
-               if (c->x86_cache_occ_scale != cqm_l3_scale) {
-                       pr_err("Multiple LLC scale values, disabling\n");
-                       ret = -EINVAL;
-                       goto out;
-               }
-       }
-
-       /*
-        * A reasonable upper limit on the max threshold is the number
-        * of lines tagged per RMID if all RMIDs have the same number of
-        * lines tagged in the LLC.
-        *
-        * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
-        */
-       __intel_cqm_max_threshold =
-               boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
-
-       snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
-       str = kstrdup(scale, GFP_KERNEL);
-       if (!str) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       event_attr_intel_cqm_llc_scale.event_str = str;
-
-       ret = intel_cqm_setup_rmid_cache();
-       if (ret)
-               goto out;
-
-       for_each_online_cpu(i) {
-               intel_cqm_cpu_starting(i);
-               cqm_pick_event_reader(i);
-       }
-
-       __perf_cpu_notifier(intel_cqm_cpu_notifier);
-
-       ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
-       if (ret)
-               pr_err("Intel CQM perf registration failed: %d\n", ret);
-       else
-               pr_info("Intel CQM monitoring enabled\n");
-
-out:
-       cpu_notifier_register_done();
-
-       return ret;
-}
-device_initcall(intel_cqm_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cstate.c b/arch/x86/kernel/cpu/perf_event_intel_cstate.c

deleted file mode 100644 (file)

index 75a38b5..0000000
--- a/arch/x86/kernel/cpu/perf_event_intel_cstate.c
+++ /dev/null
@@ -1,694 +0,0 @@
-/*
- * perf_event_intel_cstate.c: support cstate residency counters
- *
- * Copyright (C) 2015, Intel Corp.
- * Author: Kan Liang (kan.liang@intel.com)
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Library General Public License for more details.
- *
- */
-
-/*
- * This file export cstate related free running (read-only) counters
- * for perf. These counters may be use simultaneously by other tools,
- * such as turbostat. However, it still make sense to implement them
- * in perf. Because we can conveniently collect them together with
- * other events, and allow to use them from tools without special MSR
- * access code.
- *
- * The events only support system-wide mode counting. There is no
- * sampling support because it is not supported by the hardware.
- *
- * According to counters' scope and category, two PMUs are registered
- * with the perf_event core subsystem.
- *  - 'cstate_core': The counter is available for each physical core.
- *    The counters include CORE_C*_RESIDENCY.
- *  - 'cstate_pkg': The counter is available for each physical package.
- *    The counters include PKG_C*_RESIDENCY.
- *
- * All of these counters are specified in the Intel® 64 and IA-32
- * Architectures Software Developer.s Manual Vol3b.
- *
- * Model specific counters:
- *     MSR_CORE_C1_RES: CORE C1 Residency Counter
- *                      perf code: 0x00
- *                      Available model: SLM,AMT
- *                      Scope: Core (each processor core has a MSR)
- *     MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
- *                            perf code: 0x01
- *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
- *                            Scope: Core
- *     MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter
- *                            perf code: 0x02
- *                            Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL
- *                            Scope: Core
- *     MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
- *                            perf code: 0x03
- *                            Available model: SNB,IVB,HSW,BDW,SKL
- *                            Scope: Core
- *     MSR_PKG_C2_RESIDENCY:  Package C2 Residency Counter.
- *                            perf code: 0x00
- *                            Available model: SNB,IVB,HSW,BDW,SKL
- *                            Scope: Package (physical package)
- *     MSR_PKG_C3_RESIDENCY:  Package C3 Residency Counter.
- *                            perf code: 0x01
- *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
- *                            Scope: Package (physical package)
- *     MSR_PKG_C6_RESIDENCY:  Package C6 Residency Counter.
- *                            perf code: 0x02
- *                            Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL
- *                            Scope: Package (physical package)
- *     MSR_PKG_C7_RESIDENCY:  Package C7 Residency Counter.
- *                            perf code: 0x03
- *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
- *                            Scope: Package (physical package)
- *     MSR_PKG_C8_RESIDENCY:  Package C8 Residency Counter.
- *                            perf code: 0x04
- *                            Available model: HSW ULT only
- *                            Scope: Package (physical package)
- *     MSR_PKG_C9_RESIDENCY:  Package C9 Residency Counter.
- *                            perf code: 0x05
- *                            Available model: HSW ULT only
- *                            Scope: Package (physical package)
- *     MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
- *                            perf code: 0x06
- *                            Available model: HSW ULT only
- *                            Scope: Package (physical package)
- *
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/perf_event.h>
-#include <asm/cpu_device_id.h>
-#include "perf_event.h"
-
-#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format)                \
-static ssize_t __cstate_##_var##_show(struct kobject *kobj,    \
-                               struct kobj_attribute *attr,    \
-                               char *page)                     \
-{                                                              \
-       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
-       return sprintf(page, _format "\n");                     \
-}                                                              \
-static struct kobj_attribute format_attr_##_var =              \
-       __ATTR(_name, 0444, __cstate_##_var##_show, NULL)
-
-static ssize_t cstate_get_attr_cpumask(struct device *dev,
-                                      struct device_attribute *attr,
-                                      char *buf);
-
-struct perf_cstate_msr {
-       u64     msr;
-       struct  perf_pmu_events_attr *attr;
-       bool    (*test)(int idx);
-};
-
-
-/* cstate_core PMU */
-
-static struct pmu cstate_core_pmu;
-static bool has_cstate_core;
-
-enum perf_cstate_core_id {
-       /*
-        * cstate_core events
-        */
-       PERF_CSTATE_CORE_C1_RES = 0,
-       PERF_CSTATE_CORE_C3_RES,
-       PERF_CSTATE_CORE_C6_RES,
-       PERF_CSTATE_CORE_C7_RES,
-
-       PERF_CSTATE_CORE_EVENT_MAX,
-};
-
-bool test_core(int idx)
-{
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-           boot_cpu_data.x86 != 6)
-               return false;
-
-       switch (boot_cpu_data.x86_model) {
-       case 30: /* 45nm Nehalem    */
-       case 26: /* 45nm Nehalem-EP */
-       case 46: /* 45nm Nehalem-EX */
-
-       case 37: /* 32nm Westmere    */
-       case 44: /* 32nm Westmere-EP */
-       case 47: /* 32nm Westmere-EX */
-               if (idx == PERF_CSTATE_CORE_C3_RES ||
-                   idx == PERF_CSTATE_CORE_C6_RES)
-                       return true;
-               break;
-       case 42: /* 32nm SandyBridge         */
-       case 45: /* 32nm SandyBridge-E/EN/EP */
-
-       case 58: /* 22nm IvyBridge       */
-       case 62: /* 22nm IvyBridge-EP/EX */
-
-       case 60: /* 22nm Haswell Core */
-       case 63: /* 22nm Haswell Server */
-       case 69: /* 22nm Haswell ULT */
-       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-
-       case 61: /* 14nm Broadwell Core-M */
-       case 86: /* 14nm Broadwell Xeon D */
-       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
-       case 79: /* 14nm Broadwell Server */
-
-       case 78: /* 14nm Skylake Mobile */
-       case 94: /* 14nm Skylake Desktop */
-               if (idx == PERF_CSTATE_CORE_C3_RES ||
-                   idx == PERF_CSTATE_CORE_C6_RES ||
-                   idx == PERF_CSTATE_CORE_C7_RES)
-                       return true;
-               break;
-       case 55: /* 22nm Atom "Silvermont"                */
-       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
-       case 76: /* 14nm Atom "Airmont"                   */
-               if (idx == PERF_CSTATE_CORE_C1_RES ||
-                   idx == PERF_CSTATE_CORE_C6_RES)
-                       return true;
-               break;
-       }
-
-       return false;
-}
-
-PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00");
-PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01");
-PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02");
-PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03");
-
-static struct perf_cstate_msr core_msr[] = {
-       [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES,          &evattr_cstate_core_c1, test_core, },
-       [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY,    &evattr_cstate_core_c3, test_core, },
-       [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY,    &evattr_cstate_core_c6, test_core, },
-       [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY,    &evattr_cstate_core_c7, test_core, },
-};
-
-static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = {
-       NULL,
-};
-
-static struct attribute_group core_events_attr_group = {
-       .name = "events",
-       .attrs = core_events_attrs,
-};
-
-DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63");
-static struct attribute *core_format_attrs[] = {
-       &format_attr_core_event.attr,
-       NULL,
-};
-
-static struct attribute_group core_format_attr_group = {
-       .name = "format",
-       .attrs = core_format_attrs,
-};
-
-static cpumask_t cstate_core_cpu_mask;
-static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL);
-
-static struct attribute *cstate_cpumask_attrs[] = {
-       &dev_attr_cpumask.attr,
-       NULL,
-};
-
-static struct attribute_group cpumask_attr_group = {
-       .attrs = cstate_cpumask_attrs,
-};
-
-static const struct attribute_group *core_attr_groups[] = {
-       &core_events_attr_group,
-       &core_format_attr_group,
-       &cpumask_attr_group,
-       NULL,
-};
-
-/* cstate_core PMU end */
-
-
-/* cstate_pkg PMU */
-
-static struct pmu cstate_pkg_pmu;
-static bool has_cstate_pkg;
-
-enum perf_cstate_pkg_id {
-       /*
-        * cstate_pkg events
-        */
-       PERF_CSTATE_PKG_C2_RES = 0,
-       PERF_CSTATE_PKG_C3_RES,
-       PERF_CSTATE_PKG_C6_RES,
-       PERF_CSTATE_PKG_C7_RES,
-       PERF_CSTATE_PKG_C8_RES,
-       PERF_CSTATE_PKG_C9_RES,
-       PERF_CSTATE_PKG_C10_RES,
-
-       PERF_CSTATE_PKG_EVENT_MAX,
-};
-
-bool test_pkg(int idx)
-{
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-           boot_cpu_data.x86 != 6)
-               return false;
-
-       switch (boot_cpu_data.x86_model) {
-       case 30: /* 45nm Nehalem    */
-       case 26: /* 45nm Nehalem-EP */
-       case 46: /* 45nm Nehalem-EX */
-
-       case 37: /* 32nm Westmere    */
-       case 44: /* 32nm Westmere-EP */
-       case 47: /* 32nm Westmere-EX */
-               if (idx == PERF_CSTATE_CORE_C3_RES ||
-                   idx == PERF_CSTATE_CORE_C6_RES ||
-                   idx == PERF_CSTATE_CORE_C7_RES)
-                       return true;
-               break;
-       case 42: /* 32nm SandyBridge         */
-       case 45: /* 32nm SandyBridge-E/EN/EP */
-
-       case 58: /* 22nm IvyBridge       */
-       case 62: /* 22nm IvyBridge-EP/EX */
-
-       case 60: /* 22nm Haswell Core */
-       case 63: /* 22nm Haswell Server */
-       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-
-       case 61: /* 14nm Broadwell Core-M */
-       case 86: /* 14nm Broadwell Xeon D */
-       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
-       case 79: /* 14nm Broadwell Server */
-
-       case 78: /* 14nm Skylake Mobile */
-       case 94: /* 14nm Skylake Desktop */
-               if (idx == PERF_CSTATE_PKG_C2_RES ||
-                   idx == PERF_CSTATE_PKG_C3_RES ||
-                   idx == PERF_CSTATE_PKG_C6_RES ||
-                   idx == PERF_CSTATE_PKG_C7_RES)
-                       return true;
-               break;
-       case 55: /* 22nm Atom "Silvermont"                */
-       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
-       case 76: /* 14nm Atom "Airmont"                   */
-               if (idx == PERF_CSTATE_CORE_C6_RES)
-                       return true;
-               break;
-       case 69: /* 22nm Haswell ULT */
-               if (idx == PERF_CSTATE_PKG_C2_RES ||
-                   idx == PERF_CSTATE_PKG_C3_RES ||
-                   idx == PERF_CSTATE_PKG_C6_RES ||
-                   idx == PERF_CSTATE_PKG_C7_RES ||
-                   idx == PERF_CSTATE_PKG_C8_RES ||
-                   idx == PERF_CSTATE_PKG_C9_RES ||
-                   idx == PERF_CSTATE_PKG_C10_RES)
-                       return true;
-               break;
-       }
-
-       return false;
-}
-
-PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00");
-PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01");
-PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02");
-PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03");
-PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04");
-PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05");
-PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06");
-
-static struct perf_cstate_msr pkg_msr[] = {
-       [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY,      &evattr_cstate_pkg_c2,  test_pkg, },
-       [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY,      &evattr_cstate_pkg_c3,  test_pkg, },
-       [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY,      &evattr_cstate_pkg_c6,  test_pkg, },
-       [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY,      &evattr_cstate_pkg_c7,  test_pkg, },
-       [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY,      &evattr_cstate_pkg_c8,  test_pkg, },
-       [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY,      &evattr_cstate_pkg_c9,  test_pkg, },
-       [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY,    &evattr_cstate_pkg_c10, test_pkg, },
-};
-
-static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = {
-       NULL,
-};
-
-static struct attribute_group pkg_events_attr_group = {
-       .name = "events",
-       .attrs = pkg_events_attrs,
-};
-
-DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63");
-static struct attribute *pkg_format_attrs[] = {
-       &format_attr_pkg_event.attr,
-       NULL,
-};
-static struct attribute_group pkg_format_attr_group = {
-       .name = "format",
-       .attrs = pkg_format_attrs,
-};
-
-static cpumask_t cstate_pkg_cpu_mask;
-
-static const struct attribute_group *pkg_attr_groups[] = {
-       &pkg_events_attr_group,
-       &pkg_format_attr_group,
-       &cpumask_attr_group,
-       NULL,
-};
-
-/* cstate_pkg PMU end*/
-
-static ssize_t cstate_get_attr_cpumask(struct device *dev,
-                                      struct device_attribute *attr,
-                                      char *buf)
-{
-       struct pmu *pmu = dev_get_drvdata(dev);
-
-       if (pmu == &cstate_core_pmu)
-               return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
-       else if (pmu == &cstate_pkg_pmu)
-               return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
-       else
-               return 0;
-}
-
-static int cstate_pmu_event_init(struct perf_event *event)
-{
-       u64 cfg = event->attr.config;
-       int ret = 0;
-
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       /* unsupported modes and filters */
-       if (event->attr.exclude_user   ||
-           event->attr.exclude_kernel ||
-           event->attr.exclude_hv     ||
-           event->attr.exclude_idle   ||
-           event->attr.exclude_host   ||
-           event->attr.exclude_guest  ||
-           event->attr.sample_period) /* no sampling */
-               return -EINVAL;
-
-       if (event->pmu == &cstate_core_pmu) {
-               if (cfg >= PERF_CSTATE_CORE_EVENT_MAX)
-                       return -EINVAL;
-               if (!core_msr[cfg].attr)
-                       return -EINVAL;
-               event->hw.event_base = core_msr[cfg].msr;
-       } else if (event->pmu == &cstate_pkg_pmu) {
-               if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
-                       return -EINVAL;
-               if (!pkg_msr[cfg].attr)
-                       return -EINVAL;
-               event->hw.event_base = pkg_msr[cfg].msr;
-       } else
-               return -ENOENT;
-
-       /* must be done before validate_group */
-       event->hw.config = cfg;
-       event->hw.idx = -1;
-
-       return ret;
-}
-
-static inline u64 cstate_pmu_read_counter(struct perf_event *event)
-{
-       u64 val;
-
-       rdmsrl(event->hw.event_base, val);
-       return val;
-}
-
-static void cstate_pmu_event_update(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 prev_raw_count, new_raw_count;
-
-again:
-       prev_raw_count = local64_read(&hwc->prev_count);
-       new_raw_count = cstate_pmu_read_counter(event);
-
-       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                           new_raw_count) != prev_raw_count)
-               goto again;
-
-       local64_add(new_raw_count - prev_raw_count, &event->count);
-}
-
-static void cstate_pmu_event_start(struct perf_event *event, int mode)
-{
-       local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event));
-}
-
-static void cstate_pmu_event_stop(struct perf_event *event, int mode)
-{
-       cstate_pmu_event_update(event);
-}
-
-static void cstate_pmu_event_del(struct perf_event *event, int mode)
-{
-       cstate_pmu_event_stop(event, PERF_EF_UPDATE);
-}
-
-static int cstate_pmu_event_add(struct perf_event *event, int mode)
-{
-       if (mode & PERF_EF_START)
-               cstate_pmu_event_start(event, mode);
-
-       return 0;
-}
-
-static void cstate_cpu_exit(int cpu)
-{
-       int i, id, target;
-
-       /* cpu exit for cstate core */
-       if (has_cstate_core) {
-               id = topology_core_id(cpu);
-               target = -1;
-
-               for_each_online_cpu(i) {
-                       if (i == cpu)
-                               continue;
-                       if (id == topology_core_id(i)) {
-                               target = i;
-                               break;
-                       }
-               }
-               if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0)
-                       cpumask_set_cpu(target, &cstate_core_cpu_mask);
-               WARN_ON(cpumask_empty(&cstate_core_cpu_mask));
-               if (target >= 0)
-                       perf_pmu_migrate_context(&cstate_core_pmu, cpu, target);
-       }
-
-       /* cpu exit for cstate pkg */
-       if (has_cstate_pkg) {
-               id = topology_physical_package_id(cpu);
-               target = -1;
-
-               for_each_online_cpu(i) {
-                       if (i == cpu)
-                               continue;
-                       if (id == topology_physical_package_id(i)) {
-                               target = i;
-                               break;
-                       }
-               }
-               if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0)
-                       cpumask_set_cpu(target, &cstate_pkg_cpu_mask);
-               WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask));
-               if (target >= 0)
-                       perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
-       }
-}
-
-static void cstate_cpu_init(int cpu)
-{
-       int i, id;
-
-       /* cpu init for cstate core */
-       if (has_cstate_core) {
-               id = topology_core_id(cpu);
-               for_each_cpu(i, &cstate_core_cpu_mask) {
-                       if (id == topology_core_id(i))
-                               break;
-               }
-               if (i >= nr_cpu_ids)
-                       cpumask_set_cpu(cpu, &cstate_core_cpu_mask);
-       }
-
-       /* cpu init for cstate pkg */
-       if (has_cstate_pkg) {
-               id = topology_physical_package_id(cpu);
-               for_each_cpu(i, &cstate_pkg_cpu_mask) {
-                       if (id == topology_physical_package_id(i))
-                               break;
-               }
-               if (i >= nr_cpu_ids)
-                       cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
-       }
-}
-
-static int cstate_cpu_notifier(struct notifier_block *self,
-                                 unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-               break;
-       case CPU_STARTING:
-               cstate_cpu_init(cpu);
-               break;
-       case CPU_UP_CANCELED:
-       case CPU_DYING:
-               break;
-       case CPU_ONLINE:
-       case CPU_DEAD:
-               break;
-       case CPU_DOWN_PREPARE:
-               cstate_cpu_exit(cpu);
-               break;
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-/*
- * Probe the cstate events and insert the available one into sysfs attrs
- * Return false if there is no available events.
- */
-static bool cstate_probe_msr(struct perf_cstate_msr *msr,
-                            struct attribute   **events_attrs,
-                            int max_event_nr)
-{
-       int i, j = 0;
-       u64 val;
-
-       /* Probe the cstate events. */
-       for (i = 0; i < max_event_nr; i++) {
-               if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
-                       msr[i].attr = NULL;
-       }
-
-       /* List remaining events in the sysfs attrs. */
-       for (i = 0; i < max_event_nr; i++) {
-               if (msr[i].attr)
-                       events_attrs[j++] = &msr[i].attr->attr.attr;
-       }
-       events_attrs[j] = NULL;
-
-       return (j > 0) ? true : false;
-}
-
-static int __init cstate_init(void)
-{
-       /* SLM has different MSR for PKG C6 */
-       switch (boot_cpu_data.x86_model) {
-       case 55:
-       case 76:
-       case 77:
-               pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY;
-       }
-
-       if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX))
-               has_cstate_core = true;
-
-       if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX))
-               has_cstate_pkg = true;
-
-       return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV;
-}
-
-static void __init cstate_cpumask_init(void)
-{
-       int cpu;
-
-       cpu_notifier_register_begin();
-
-       for_each_online_cpu(cpu)
-               cstate_cpu_init(cpu);
-
-       __perf_cpu_notifier(cstate_cpu_notifier);
-
-       cpu_notifier_register_done();
-}
-
-static struct pmu cstate_core_pmu = {
-       .attr_groups    = core_attr_groups,
-       .name           = "cstate_core",
-       .task_ctx_nr    = perf_invalid_context,
-       .event_init     = cstate_pmu_event_init,
-       .add            = cstate_pmu_event_add, /* must have */
-       .del            = cstate_pmu_event_del, /* must have */
-       .start          = cstate_pmu_event_start,
-       .stop           = cstate_pmu_event_stop,
-       .read           = cstate_pmu_event_update,
-       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
-};
-
-static struct pmu cstate_pkg_pmu = {
-       .attr_groups    = pkg_attr_groups,
-       .name           = "cstate_pkg",
-       .task_ctx_nr    = perf_invalid_context,
-       .event_init     = cstate_pmu_event_init,
-       .add            = cstate_pmu_event_add, /* must have */
-       .del            = cstate_pmu_event_del, /* must have */
-       .start          = cstate_pmu_event_start,
-       .stop           = cstate_pmu_event_stop,
-       .read           = cstate_pmu_event_update,
-       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
-};
-
-static void __init cstate_pmus_register(void)
-{
-       int err;
-
-       if (has_cstate_core) {
-               err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
-               if (WARN_ON(err))
-                       pr_info("Failed to register PMU %s error %d\n",
-                               cstate_core_pmu.name, err);
-       }
-
-       if (has_cstate_pkg) {
-               err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1);
-               if (WARN_ON(err))
-                       pr_info("Failed to register PMU %s error %d\n",
-                               cstate_pkg_pmu.name, err);
-       }
-}
-
-static int __init cstate_pmu_init(void)
-{
-       int err;
-
-       if (cpu_has_hypervisor)
-               return -ENODEV;
-
-       err = cstate_init();
-       if (err)
-               return err;
-
-       cstate_cpumask_init();
-
-       cstate_pmus_register();
-
-       return 0;
-}
-
-device_initcall(cstate_pmu_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c

deleted file mode 100644 (file)

index 10602f0..0000000
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ /dev/null
@@ -1,1368 +0,0 @@
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-
-#include <asm/perf_event.h>
-#include <asm/insn.h>
-
-#include "perf_event.h"
-
-/* The size of a BTS record in bytes: */
-#define BTS_RECORD_SIZE                24
-
-#define BTS_BUFFER_SIZE                (PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE       (PAGE_SIZE << 4)
-#define PEBS_FIXUP_SIZE                PAGE_SIZE
-
-/*
- * pebs_record_32 for p4 and core not supported
-
-struct pebs_record_32 {
-       u32 flags, ip;
-       u32 ax, bc, cx, dx;
-       u32 si, di, bp, sp;
-};
-
- */
-
-union intel_x86_pebs_dse {
-       u64 val;
-       struct {
-               unsigned int ld_dse:4;
-               unsigned int ld_stlb_miss:1;
-               unsigned int ld_locked:1;
-               unsigned int ld_reserved:26;
-       };
-       struct {
-               unsigned int st_l1d_hit:1;
-               unsigned int st_reserved1:3;
-               unsigned int st_stlb_miss:1;
-               unsigned int st_locked:1;
-               unsigned int st_reserved2:26;
-       };
-};
-
-
-/*
- * Map PEBS Load Latency Data Source encodings to generic
- * memory data source information
- */
-#define P(a, b) PERF_MEM_S(a, b)
-#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
-#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
-
-static const u64 pebs_data_source[] = {
-       P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
-       OP_LH | P(LVL, L1)  | P(SNOOP, NONE),   /* 0x01: L1 local */
-       OP_LH | P(LVL, LFB) | P(SNOOP, NONE),   /* 0x02: LFB hit */
-       OP_LH | P(LVL, L2)  | P(SNOOP, NONE),   /* 0x03: L2 hit */
-       OP_LH | P(LVL, L3)  | P(SNOOP, NONE),   /* 0x04: L3 hit */
-       OP_LH | P(LVL, L3)  | P(SNOOP, MISS),   /* 0x05: L3 hit, snoop miss */
-       OP_LH | P(LVL, L3)  | P(SNOOP, HIT),    /* 0x06: L3 hit, snoop hit */
-       OP_LH | P(LVL, L3)  | P(SNOOP, HITM),   /* 0x07: L3 hit, snoop hitm */
-       OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT),  /* 0x08: L3 miss snoop hit */
-       OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
-       OP_LH | P(LVL, LOC_RAM)  | P(SNOOP, HIT),  /* 0x0a: L3 miss, shared */
-       OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT),  /* 0x0b: L3 miss, shared */
-       OP_LH | P(LVL, LOC_RAM)  | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
-       OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
-       OP_LH | P(LVL, IO)  | P(SNOOP, NONE), /* 0x0e: I/O */
-       OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
-};
-
-static u64 precise_store_data(u64 status)
-{
-       union intel_x86_pebs_dse dse;
-       u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2);
-
-       dse.val = status;
-
-       /*
-        * bit 4: TLB access
-        * 1 = stored missed 2nd level TLB
-        *
-        * so it either hit the walker or the OS
-        * otherwise hit 2nd level TLB
-        */
-       if (dse.st_stlb_miss)
-               val |= P(TLB, MISS);
-       else
-               val |= P(TLB, HIT);
-
-       /*
-        * bit 0: hit L1 data cache
-        * if not set, then all we know is that
-        * it missed L1D
-        */
-       if (dse.st_l1d_hit)
-               val |= P(LVL, HIT);
-       else
-               val |= P(LVL, MISS);
-
-       /*
-        * bit 5: Locked prefix
-        */
-       if (dse.st_locked)
-               val |= P(LOCK, LOCKED);
-
-       return val;
-}
-
-static u64 precise_datala_hsw(struct perf_event *event, u64 status)
-{
-       union perf_mem_data_src dse;
-
-       dse.val = PERF_MEM_NA;
-
-       if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
-               dse.mem_op = PERF_MEM_OP_STORE;
-       else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW)
-               dse.mem_op = PERF_MEM_OP_LOAD;
-
-       /*
-        * L1 info only valid for following events:
-        *
-        * MEM_UOPS_RETIRED.STLB_MISS_STORES
-        * MEM_UOPS_RETIRED.LOCK_STORES
-        * MEM_UOPS_RETIRED.SPLIT_STORES
-        * MEM_UOPS_RETIRED.ALL_STORES
-        */
-       if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) {
-               if (status & 1)
-                       dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
-               else
-                       dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
-       }
-       return dse.val;
-}
-
-static u64 load_latency_data(u64 status)
-{
-       union intel_x86_pebs_dse dse;
-       u64 val;
-       int model = boot_cpu_data.x86_model;
-       int fam = boot_cpu_data.x86;
-
-       dse.val = status;
-
-       /*
-        * use the mapping table for bit 0-3
-        */
-       val = pebs_data_source[dse.ld_dse];
-
-       /*
-        * Nehalem models do not support TLB, Lock infos
-        */
-       if (fam == 0x6 && (model == 26 || model == 30
-           || model == 31 || model == 46)) {
-               val |= P(TLB, NA) | P(LOCK, NA);
-               return val;
-       }
-       /*
-        * bit 4: TLB access
-        * 0 = did not miss 2nd level TLB
-        * 1 = missed 2nd level TLB
-        */
-       if (dse.ld_stlb_miss)
-               val |= P(TLB, MISS) | P(TLB, L2);
-       else
-               val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
-
-       /*
-        * bit 5: locked prefix
-        */
-       if (dse.ld_locked)
-               val |= P(LOCK, LOCKED);
-
-       return val;
-}
-
-struct pebs_record_core {
-       u64 flags, ip;
-       u64 ax, bx, cx, dx;
-       u64 si, di, bp, sp;
-       u64 r8,  r9,  r10, r11;
-       u64 r12, r13, r14, r15;
-};
-
-struct pebs_record_nhm {
-       u64 flags, ip;
-       u64 ax, bx, cx, dx;
-       u64 si, di, bp, sp;
-       u64 r8,  r9,  r10, r11;
-       u64 r12, r13, r14, r15;
-       u64 status, dla, dse, lat;
-};
-
-/*
- * Same as pebs_record_nhm, with two additional fields.
- */
-struct pebs_record_hsw {
-       u64 flags, ip;
-       u64 ax, bx, cx, dx;
-       u64 si, di, bp, sp;
-       u64 r8,  r9,  r10, r11;
-       u64 r12, r13, r14, r15;
-       u64 status, dla, dse, lat;
-       u64 real_ip, tsx_tuning;
-};
-
-union hsw_tsx_tuning {
-       struct {
-               u32 cycles_last_block     : 32,
-                   hle_abort             : 1,
-                   rtm_abort             : 1,
-                   instruction_abort     : 1,
-                   non_instruction_abort : 1,
-                   retry                 : 1,
-                   data_conflict         : 1,
-                   capacity_writes       : 1,
-                   capacity_reads        : 1;
-       };
-       u64         value;
-};
-
-#define PEBS_HSW_TSX_FLAGS     0xff00000000ULL
-
-/* Same as HSW, plus TSC */
-
-struct pebs_record_skl {
-       u64 flags, ip;
-       u64 ax, bx, cx, dx;
-       u64 si, di, bp, sp;
-       u64 r8,  r9,  r10, r11;
-       u64 r12, r13, r14, r15;
-       u64 status, dla, dse, lat;
-       u64 real_ip, tsx_tuning;
-       u64 tsc;
-};
-
-void init_debug_store_on_cpu(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-       if (!ds)
-               return;
-
-       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
-                    (u32)((u64)(unsigned long)ds),
-                    (u32)((u64)(unsigned long)ds >> 32));
-}
-
-void fini_debug_store_on_cpu(int cpu)
-{
-       if (!per_cpu(cpu_hw_events, cpu).ds)
-               return;
-
-       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
-}
-
-static DEFINE_PER_CPU(void *, insn_buffer);
-
-static int alloc_pebs_buffer(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-       int node = cpu_to_node(cpu);
-       int max;
-       void *buffer, *ibuffer;
-
-       if (!x86_pmu.pebs)
-               return 0;
-
-       buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node);
-       if (unlikely(!buffer))
-               return -ENOMEM;
-
-       /*
-        * HSW+ already provides us the eventing ip; no need to allocate this
-        * buffer then.
-        */
-       if (x86_pmu.intel_cap.pebs_format < 2) {
-               ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
-               if (!ibuffer) {
-                       kfree(buffer);
-                       return -ENOMEM;
-               }
-               per_cpu(insn_buffer, cpu) = ibuffer;
-       }
-
-       max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
-
-       ds->pebs_buffer_base = (u64)(unsigned long)buffer;
-       ds->pebs_index = ds->pebs_buffer_base;
-       ds->pebs_absolute_maximum = ds->pebs_buffer_base +
-               max * x86_pmu.pebs_record_size;
-
-       return 0;
-}
-
-static void release_pebs_buffer(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-       if (!ds || !x86_pmu.pebs)
-               return;
-
-       kfree(per_cpu(insn_buffer, cpu));
-       per_cpu(insn_buffer, cpu) = NULL;
-
-       kfree((void *)(unsigned long)ds->pebs_buffer_base);
-       ds->pebs_buffer_base = 0;
-}
-
-static int alloc_bts_buffer(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-       int node = cpu_to_node(cpu);
-       int max, thresh;
-       void *buffer;
-
-       if (!x86_pmu.bts)
-               return 0;
-
-       buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
-       if (unlikely(!buffer)) {
-               WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
-               return -ENOMEM;
-       }
-
-       max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
-       thresh = max / 16;
-
-       ds->bts_buffer_base = (u64)(unsigned long)buffer;
-       ds->bts_index = ds->bts_buffer_base;
-       ds->bts_absolute_maximum = ds->bts_buffer_base +
-               max * BTS_RECORD_SIZE;
-       ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
-               thresh * BTS_RECORD_SIZE;
-
-       return 0;
-}
-
-static void release_bts_buffer(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-       if (!ds || !x86_pmu.bts)
-               return;
-
-       kfree((void *)(unsigned long)ds->bts_buffer_base);
-       ds->bts_buffer_base = 0;
-}
-
-static int alloc_ds_buffer(int cpu)
-{
-       int node = cpu_to_node(cpu);
-       struct debug_store *ds;
-
-       ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
-       if (unlikely(!ds))
-               return -ENOMEM;
-
-       per_cpu(cpu_hw_events, cpu).ds = ds;
-
-       return 0;
-}
-
-static void release_ds_buffer(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-       if (!ds)
-               return;
-
-       per_cpu(cpu_hw_events, cpu).ds = NULL;
-       kfree(ds);
-}
-
-void release_ds_buffers(void)
-{
-       int cpu;
-
-       if (!x86_pmu.bts && !x86_pmu.pebs)
-               return;
-
-       get_online_cpus();
-       for_each_online_cpu(cpu)
-               fini_debug_store_on_cpu(cpu);
-
-       for_each_possible_cpu(cpu) {
-               release_pebs_buffer(cpu);
-               release_bts_buffer(cpu);
-               release_ds_buffer(cpu);
-       }
-       put_online_cpus();
-}
-
-void reserve_ds_buffers(void)
-{
-       int bts_err = 0, pebs_err = 0;
-       int cpu;
-
-       x86_pmu.bts_active = 0;
-       x86_pmu.pebs_active = 0;
-
-       if (!x86_pmu.bts && !x86_pmu.pebs)
-               return;
-
-       if (!x86_pmu.bts)
-               bts_err = 1;
-
-       if (!x86_pmu.pebs)
-               pebs_err = 1;
-
-       get_online_cpus();
-
-       for_each_possible_cpu(cpu) {
-               if (alloc_ds_buffer(cpu)) {
-                       bts_err = 1;
-                       pebs_err = 1;
-               }
-
-               if (!bts_err && alloc_bts_buffer(cpu))
-                       bts_err = 1;
-
-               if (!pebs_err && alloc_pebs_buffer(cpu))
-                       pebs_err = 1;
-
-               if (bts_err && pebs_err)
-                       break;
-       }
-
-       if (bts_err) {
-               for_each_possible_cpu(cpu)
-                       release_bts_buffer(cpu);
-       }
-
-       if (pebs_err) {
-               for_each_possible_cpu(cpu)
-                       release_pebs_buffer(cpu);
-       }
-
-       if (bts_err && pebs_err) {
-               for_each_possible_cpu(cpu)
-                       release_ds_buffer(cpu);
-       } else {
-               if (x86_pmu.bts && !bts_err)
-                       x86_pmu.bts_active = 1;
-
-               if (x86_pmu.pebs && !pebs_err)
-                       x86_pmu.pebs_active = 1;
-
-               for_each_online_cpu(cpu)
-                       init_debug_store_on_cpu(cpu);
-       }
-
-       put_online_cpus();
-}
-
-/*
- * BTS
- */
-
-struct event_constraint bts_constraint =
-       EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);
-
-void intel_pmu_enable_bts(u64 config)
-{
-       unsigned long debugctlmsr;
-
-       debugctlmsr = get_debugctlmsr();
-
-       debugctlmsr |= DEBUGCTLMSR_TR;
-       debugctlmsr |= DEBUGCTLMSR_BTS;
-       if (config & ARCH_PERFMON_EVENTSEL_INT)
-               debugctlmsr |= DEBUGCTLMSR_BTINT;
-
-       if (!(config & ARCH_PERFMON_EVENTSEL_OS))
-               debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
-
-       if (!(config & ARCH_PERFMON_EVENTSEL_USR))
-               debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
-
-       update_debugctlmsr(debugctlmsr);
-}
-
-void intel_pmu_disable_bts(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       unsigned long debugctlmsr;
-
-       if (!cpuc->ds)
-               return;
-
-       debugctlmsr = get_debugctlmsr();
-
-       debugctlmsr &=
-               ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
-                 DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
-
-       update_debugctlmsr(debugctlmsr);
-}
-
-int intel_pmu_drain_bts_buffer(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct debug_store *ds = cpuc->ds;
-       struct bts_record {
-               u64     from;
-               u64     to;
-               u64     flags;
-       };
-       struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
-       struct bts_record *at, *base, *top;
-       struct perf_output_handle handle;
-       struct perf_event_header header;
-       struct perf_sample_data data;
-       unsigned long skip = 0;
-       struct pt_regs regs;
-
-       if (!event)
-               return 0;
-
-       if (!x86_pmu.bts_active)
-               return 0;
-
-       base = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
-       top  = (struct bts_record *)(unsigned long)ds->bts_index;
-
-       if (top <= base)
-               return 0;
-
-       memset(&regs, 0, sizeof(regs));
-
-       ds->bts_index = ds->bts_buffer_base;
-
-       perf_sample_data_init(&data, 0, event->hw.last_period);
-
-       /*
-        * BTS leaks kernel addresses in branches across the cpl boundary,
-        * such as traps or system calls, so unless the user is asking for
-        * kernel tracing (and right now it's not possible), we'd need to
-        * filter them out. But first we need to count how many of those we
-        * have in the current batch. This is an extra O(n) pass, however,
-        * it's much faster than the other one especially considering that
-        * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the
-        * alloc_bts_buffer()).
-        */
-       for (at = base; at < top; at++) {
-               /*
-                * Note that right now *this* BTS code only works if
-                * attr::exclude_kernel is set, but let's keep this extra
-                * check here in case that changes.
-                */
-               if (event->attr.exclude_kernel &&
-                   (kernel_ip(at->from) || kernel_ip(at->to)))
-                       skip++;
-       }
-
-       /*
-        * Prepare a generic sample, i.e. fill in the invariant fields.
-        * We will overwrite the from and to address before we output
-        * the sample.
-        */
-       perf_prepare_sample(&header, &data, event, &regs);
-
-       if (perf_output_begin(&handle, event, header.size *
-                             (top - base - skip)))
-               return 1;
-
-       for (at = base; at < top; at++) {
-               /* Filter out any records that contain kernel addresses. */
-               if (event->attr.exclude_kernel &&
-                   (kernel_ip(at->from) || kernel_ip(at->to)))
-                       continue;
-
-               data.ip         = at->from;
-               data.addr       = at->to;
-
-               perf_output_sample(&handle, &header, &data, event);
-       }
-
-       perf_output_end(&handle);
-
-       /* There's new data available. */
-       event->hw.interrupts++;
-       event->pending_kill = POLL_IN;
-       return 1;
-}
-
-static inline void intel_pmu_drain_pebs_buffer(void)
-{
-       struct pt_regs regs;
-
-       x86_pmu.drain_pebs(&regs);
-}
-
-void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
-{
-       if (!sched_in)
-               intel_pmu_drain_pebs_buffer();
-}
-
-/*
- * PEBS
- */
-struct event_constraint intel_core2_pebs_event_constraints[] = {
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
-       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_atom_pebs_event_constraints[] = {
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
-       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_slm_pebs_event_constraints[] = {
-       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1),
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_nehalem_pebs_event_constraints[] = {
-       INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf),    /* INST_RETIRED.ANY */
-       INTEL_EVENT_CONSTRAINT(0xc2, 0xf),    /* UOPS_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf),    /* SSEX_UOPS_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
-       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_westmere_pebs_event_constraints[] = {
-       INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf),    /* INSTR_RETIRED.* */
-       INTEL_EVENT_CONSTRAINT(0xc2, 0xf),    /* UOPS_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf),    /* BR_MISP_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf),    /* SSEX_UOPS_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
-       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_snb_pebs_event_constraints[] = {
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
-       INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
-       INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
-       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
-        INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
-        INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
-        INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-        INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_ivb_pebs_event_constraints[] = {
-        INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
-        INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
-       INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
-       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
-       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
-       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
-        EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_hsw_pebs_event_constraints[] = {
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
-       INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
-       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
-       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_skl_pebs_event_constraints[] = {
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2),      /* INST_RETIRED.PREC_DIST */
-       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
-       /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
-       INTEL_PLD_CONSTRAINT(0x1cd, 0xf),                     /* MEM_TRANS_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_L3_HIT_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_L3_MISS_RETIRED.* */
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint *intel_pebs_constraints(struct perf_event *event)
-{
-       struct event_constraint *c;
-
-       if (!event->attr.precise_ip)
-               return NULL;
-
-       if (x86_pmu.pebs_constraints) {
-               for_each_event_constraint(c, x86_pmu.pebs_constraints) {
-                       if ((event->hw.config & c->cmask) == c->code) {
-                               event->hw.flags |= c->flags;
-                               return c;
-                       }
-               }
-       }
-
-       return &emptyconstraint;
-}
-
-static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
-{
-       return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
-}
-
-void intel_pmu_pebs_enable(struct perf_event *event)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct hw_perf_event *hwc = &event->hw;
-       struct debug_store *ds = cpuc->ds;
-       bool first_pebs;
-       u64 threshold;
-
-       hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
-
-       first_pebs = !pebs_is_enabled(cpuc);
-       cpuc->pebs_enabled |= 1ULL << hwc->idx;
-
-       if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
-               cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
-       else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
-               cpuc->pebs_enabled |= 1ULL << 63;
-
-       /*
-        * When the event is constrained enough we can use a larger
-        * threshold and run the event with less frequent PMI.
-        */
-       if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
-               threshold = ds->pebs_absolute_maximum -
-                       x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
-
-               if (first_pebs)
-                       perf_sched_cb_inc(event->ctx->pmu);
-       } else {
-               threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
-
-               /*
-                * If not all events can use larger buffer,
-                * roll back to threshold = 1
-                */
-               if (!first_pebs &&
-                   (ds->pebs_interrupt_threshold > threshold))
-                       perf_sched_cb_dec(event->ctx->pmu);
-       }
-
-       /* Use auto-reload if possible to save a MSR write in the PMI */
-       if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
-               ds->pebs_event_reset[hwc->idx] =
-                       (u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
-       }
-
-       if (first_pebs || ds->pebs_interrupt_threshold > threshold)
-               ds->pebs_interrupt_threshold = threshold;
-}
-
-void intel_pmu_pebs_disable(struct perf_event *event)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct hw_perf_event *hwc = &event->hw;
-       struct debug_store *ds = cpuc->ds;
-       bool large_pebs = ds->pebs_interrupt_threshold >
-               ds->pebs_buffer_base + x86_pmu.pebs_record_size;
-
-       if (large_pebs)
-               intel_pmu_drain_pebs_buffer();
-
-       cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
-
-       if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
-               cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
-       else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
-               cpuc->pebs_enabled &= ~(1ULL << 63);
-
-       if (large_pebs && !pebs_is_enabled(cpuc))
-               perf_sched_cb_dec(event->ctx->pmu);
-
-       if (cpuc->enabled)
-               wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
-
-       hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
-}
-
-void intel_pmu_pebs_enable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (cpuc->pebs_enabled)
-               wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
-}
-
-void intel_pmu_pebs_disable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (cpuc->pebs_enabled)
-               wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
-}
-
-static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       unsigned long from = cpuc->lbr_entries[0].from;
-       unsigned long old_to, to = cpuc->lbr_entries[0].to;
-       unsigned long ip = regs->ip;
-       int is_64bit = 0;
-       void *kaddr;
-       int size;
-
-       /*
-        * We don't need to fixup if the PEBS assist is fault like
-        */
-       if (!x86_pmu.intel_cap.pebs_trap)
-               return 1;
-
-       /*
-        * No LBR entry, no basic block, no rewinding
-        */
-       if (!cpuc->lbr_stack.nr || !from || !to)
-               return 0;
-
-       /*
-        * Basic blocks should never cross user/kernel boundaries
-        */
-       if (kernel_ip(ip) != kernel_ip(to))
-               return 0;
-
-       /*
-        * unsigned math, either ip is before the start (impossible) or
-        * the basic block is larger than 1 page (sanity)
-        */
-       if ((ip - to) > PEBS_FIXUP_SIZE)
-               return 0;
-
-       /*
-        * We sampled a branch insn, rewind using the LBR stack
-        */
-       if (ip == to) {
-               set_linear_ip(regs, from);
-               return 1;
-       }
-
-       size = ip - to;
-       if (!kernel_ip(ip)) {
-               int bytes;
-               u8 *buf = this_cpu_read(insn_buffer);
-
-               /* 'size' must fit our buffer, see above */
-               bytes = copy_from_user_nmi(buf, (void __user *)to, size);
-               if (bytes != 0)
-                       return 0;
-
-               kaddr = buf;
-       } else {
-               kaddr = (void *)to;
-       }
-
-       do {
-               struct insn insn;
-
-               old_to = to;
-
-#ifdef CONFIG_X86_64
-               is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
-#endif
-               insn_init(&insn, kaddr, size, is_64bit);
-               insn_get_length(&insn);
-               /*
-                * Make sure there was not a problem decoding the
-                * instruction and getting the length.  This is
-                * doubly important because we have an infinite
-                * loop if insn.length=0.
-                */
-               if (!insn.length)
-                       break;
-
-               to += insn.length;
-               kaddr += insn.length;
-               size -= insn.length;
-       } while (to < ip);
-
-       if (to == ip) {
-               set_linear_ip(regs, old_to);
-               return 1;
-       }
-
-       /*
-        * Even though we decoded the basic block, the instruction stream
-        * never matched the given IP, either the TO or the IP got corrupted.
-        */
-       return 0;
-}
-
-static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)
-{
-       if (pebs->tsx_tuning) {
-               union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
-               return tsx.cycles_last_block;
-       }
-       return 0;
-}
-
-static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
-{
-       u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
-
-       /* For RTM XABORTs also log the abort code from AX */
-       if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1))
-               txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
-       return txn;
-}
-
-static void setup_pebs_sample_data(struct perf_event *event,
-                                  struct pt_regs *iregs, void *__pebs,
-                                  struct perf_sample_data *data,
-                                  struct pt_regs *regs)
-{
-#define PERF_X86_EVENT_PEBS_HSW_PREC \
-               (PERF_X86_EVENT_PEBS_ST_HSW | \
-                PERF_X86_EVENT_PEBS_LD_HSW | \
-                PERF_X86_EVENT_PEBS_NA_HSW)
-       /*
-        * We cast to the biggest pebs_record but are careful not to
-        * unconditionally access the 'extra' entries.
-        */
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct pebs_record_skl *pebs = __pebs;
-       u64 sample_type;
-       int fll, fst, dsrc;
-       int fl = event->hw.flags;
-
-       if (pebs == NULL)
-               return;
-
-       sample_type = event->attr.sample_type;
-       dsrc = sample_type & PERF_SAMPLE_DATA_SRC;
-
-       fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
-       fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
-
-       perf_sample_data_init(data, 0, event->hw.last_period);
-
-       data->period = event->hw.last_period;
-
-       /*
-        * Use latency for weight (only avail with PEBS-LL)
-        */
-       if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
-               data->weight = pebs->lat;
-
-       /*
-        * data.data_src encodes the data source
-        */
-       if (dsrc) {
-               u64 val = PERF_MEM_NA;
-               if (fll)
-                       val = load_latency_data(pebs->dse);
-               else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
-                       val = precise_datala_hsw(event, pebs->dse);
-               else if (fst)
-                       val = precise_store_data(pebs->dse);
-               data->data_src.val = val;
-       }
-
-       /*
-        * We use the interrupt regs as a base because the PEBS record
-        * does not contain a full regs set, specifically it seems to
-        * lack segment descriptors, which get used by things like
-        * user_mode().
-        *
-        * In the simple case fix up only the IP and BP,SP regs, for
-        * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
-        * A possible PERF_SAMPLE_REGS will have to transfer all regs.
-        */
-       *regs = *iregs;
-       regs->flags = pebs->flags;
-       set_linear_ip(regs, pebs->ip);
-       regs->bp = pebs->bp;
-       regs->sp = pebs->sp;
-
-       if (sample_type & PERF_SAMPLE_REGS_INTR) {
-               regs->ax = pebs->ax;
-               regs->bx = pebs->bx;
-               regs->cx = pebs->cx;
-               regs->dx = pebs->dx;
-               regs->si = pebs->si;
-               regs->di = pebs->di;
-               regs->bp = pebs->bp;
-               regs->sp = pebs->sp;
-
-               regs->flags = pebs->flags;
-#ifndef CONFIG_X86_32
-               regs->r8 = pebs->r8;
-               regs->r9 = pebs->r9;
-               regs->r10 = pebs->r10;
-               regs->r11 = pebs->r11;
-               regs->r12 = pebs->r12;
-               regs->r13 = pebs->r13;
-               regs->r14 = pebs->r14;
-               regs->r15 = pebs->r15;
-#endif
-       }
-
-       if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
-               regs->ip = pebs->real_ip;
-               regs->flags |= PERF_EFLAGS_EXACT;
-       } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs))
-               regs->flags |= PERF_EFLAGS_EXACT;
-       else
-               regs->flags &= ~PERF_EFLAGS_EXACT;
-
-       if ((sample_type & PERF_SAMPLE_ADDR) &&
-           x86_pmu.intel_cap.pebs_format >= 1)
-               data->addr = pebs->dla;
-
-       if (x86_pmu.intel_cap.pebs_format >= 2) {
-               /* Only set the TSX weight when no memory weight. */
-               if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
-                       data->weight = intel_hsw_weight(pebs);
-
-               if (sample_type & PERF_SAMPLE_TRANSACTION)
-                       data->txn = intel_hsw_transaction(pebs);
-       }
-
-       /*
-        * v3 supplies an accurate time stamp, so we use that
-        * for the time stamp.
-        *
-        * We can only do this for the default trace clock.
-        */
-       if (x86_pmu.intel_cap.pebs_format >= 3 &&
-               event->attr.use_clockid == 0)
-               data->time = native_sched_clock_from_tsc(pebs->tsc);
-
-       if (has_branch_stack(event))
-               data->br_stack = &cpuc->lbr_stack;
-}
-
-static inline void *
-get_next_pebs_record_by_bit(void *base, void *top, int bit)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       void *at;
-       u64 pebs_status;
-
-       /*
-        * fmt0 does not have a status bitfield (does not use
-        * perf_record_nhm format)
-        */
-       if (x86_pmu.intel_cap.pebs_format < 1)
-               return base;
-
-       if (base == NULL)
-               return NULL;
-
-       for (at = base; at < top; at += x86_pmu.pebs_record_size) {
-               struct pebs_record_nhm *p = at;
-
-               if (test_bit(bit, (unsigned long *)&p->status)) {
-                       /* PEBS v3 has accurate status bits */
-                       if (x86_pmu.intel_cap.pebs_format >= 3)
-                               return at;
-
-                       if (p->status == (1 << bit))
-                               return at;
-
-                       /* clear non-PEBS bit and re-check */
-                       pebs_status = p->status & cpuc->pebs_enabled;
-                       pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
-                       if (pebs_status == (1 << bit))
-                               return at;
-               }
-       }
-       return NULL;
-}
-
-static void __intel_pmu_pebs_event(struct perf_event *event,
-                                  struct pt_regs *iregs,
-                                  void *base, void *top,
-                                  int bit, int count)
-{
-       struct perf_sample_data data;
-       struct pt_regs regs;
-       void *at = get_next_pebs_record_by_bit(base, top, bit);
-
-       if (!intel_pmu_save_and_restart(event) &&
-           !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
-               return;
-
-       while (count > 1) {
-               setup_pebs_sample_data(event, iregs, at, &data, &regs);
-               perf_event_output(event, &data, &regs);
-               at += x86_pmu.pebs_record_size;
-               at = get_next_pebs_record_by_bit(at, top, bit);
-               count--;
-       }
-
-       setup_pebs_sample_data(event, iregs, at, &data, &regs);
-
-       /*
-        * All but the last records are processed.
-        * The last one is left to be able to call the overflow handler.
-        */
-       if (perf_event_overflow(event, &data, &regs)) {
-               x86_pmu_stop(event, 0);
-               return;
-       }
-
-}
-
-static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct debug_store *ds = cpuc->ds;
-       struct perf_event *event = cpuc->events[0]; /* PMC0 only */
-       struct pebs_record_core *at, *top;
-       int n;
-
-       if (!x86_pmu.pebs_active)
-               return;
-
-       at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
-       top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
-
-       /*
-        * Whatever else happens, drain the thing
-        */
-       ds->pebs_index = ds->pebs_buffer_base;
-
-       if (!test_bit(0, cpuc->active_mask))
-               return;
-
-       WARN_ON_ONCE(!event);
-
-       if (!event->attr.precise_ip)
-               return;
-
-       n = top - at;
-       if (n <= 0)
-               return;
-
-       __intel_pmu_pebs_event(event, iregs, at, top, 0, n);
-}
-
-static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct debug_store *ds = cpuc->ds;
-       struct perf_event *event;
-       void *base, *at, *top;
-       short counts[MAX_PEBS_EVENTS] = {};
-       short error[MAX_PEBS_EVENTS] = {};
-       int bit, i;
-
-       if (!x86_pmu.pebs_active)
-               return;
-
-       base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
-       top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
-
-       ds->pebs_index = ds->pebs_buffer_base;
-
-       if (unlikely(base >= top))
-               return;
-
-       for (at = base; at < top; at += x86_pmu.pebs_record_size) {
-               struct pebs_record_nhm *p = at;
-               u64 pebs_status;
-
-               /* PEBS v3 has accurate status bits */
-               if (x86_pmu.intel_cap.pebs_format >= 3) {
-                       for_each_set_bit(bit, (unsigned long *)&p->status,
-                                        MAX_PEBS_EVENTS)
-                               counts[bit]++;
-
-                       continue;
-               }
-
-               pebs_status = p->status & cpuc->pebs_enabled;
-               pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
-
-               /*
-                * On some CPUs the PEBS status can be zero when PEBS is
-                * racing with clearing of GLOBAL_STATUS.
-                *
-                * Normally we would drop that record, but in the
-                * case when there is only a single active PEBS event
-                * we can assume it's for that event.
-                */
-               if (!pebs_status && cpuc->pebs_enabled &&
-                       !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
-                       pebs_status = cpuc->pebs_enabled;
-
-               bit = find_first_bit((unsigned long *)&pebs_status,
-                                       x86_pmu.max_pebs_events);
-               if (bit >= x86_pmu.max_pebs_events)
-                       continue;
-
-               /*
-                * The PEBS hardware does not deal well with the situation
-                * when events happen near to each other and multiple bits
-                * are set. But it should happen rarely.
-                *
-                * If these events include one PEBS and multiple non-PEBS
-                * events, it doesn't impact PEBS record. The record will
-                * be handled normally. (slow path)
-                *
-                * If these events include two or more PEBS events, the
-                * records for the events can be collapsed into a single
-                * one, and it's not possible to reconstruct all events
-                * that caused the PEBS record. It's called collision.
-                * If collision happened, the record will be dropped.
-                */
-               if (p->status != (1ULL << bit)) {
-                       for_each_set_bit(i, (unsigned long *)&pebs_status,
-                                        x86_pmu.max_pebs_events)
-                               error[i]++;
-                       continue;
-               }
-
-               counts[bit]++;
-       }
-
-       for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
-               if ((counts[bit] == 0) && (error[bit] == 0))
-                       continue;
-
-               event = cpuc->events[bit];
-               WARN_ON_ONCE(!event);
-               WARN_ON_ONCE(!event->attr.precise_ip);
-
-               /* log dropped samples number */
-               if (error[bit])
-                       perf_log_lost_samples(event, error[bit]);
-
-               if (counts[bit]) {
-                       __intel_pmu_pebs_event(event, iregs, base,
-                                              top, bit, counts[bit]);
-               }
-       }
-}
-
-/*
- * BTS, PEBS probe and setup
- */
-
-void __init intel_ds_init(void)
-{
-       /*
-        * No support for 32bit formats
-        */
-       if (!boot_cpu_has(X86_FEATURE_DTES64))
-               return;
-
-       x86_pmu.bts  = boot_cpu_has(X86_FEATURE_BTS);
-       x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
-       if (x86_pmu.pebs) {
-               char pebs_type = x86_pmu.intel_cap.pebs_trap ?  '+' : '-';
-               int format = x86_pmu.intel_cap.pebs_format;
-
-               switch (format) {
-               case 0:
-                       printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
-                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
-                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
-                       break;
-
-               case 1:
-                       printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
-                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
-                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
-                       break;
-
-               case 2:
-                       pr_cont("PEBS fmt2%c, ", pebs_type);
-                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw);
-                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
-                       break;
-
-               case 3:
-                       pr_cont("PEBS fmt3%c, ", pebs_type);
-                       x86_pmu.pebs_record_size =
-                                               sizeof(struct pebs_record_skl);
-                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
-                       x86_pmu.free_running_flags |= PERF_SAMPLE_TIME;
-                       break;
-
-               default:
-                       printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
-                       x86_pmu.pebs = 0;
-               }
-       }
-}
-
-void perf_restore_debug_store(void)
-{
-       struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
-
-       if (!x86_pmu.bts && !x86_pmu.pebs)
-               return;
-
-       wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds);
-}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c

deleted file mode 100644 (file)

index 653f88d..0000000
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ /dev/null
@@ -1,1062 +0,0 @@
-#include <linux/perf_event.h>
-#include <linux/types.h>
-
-#include <asm/perf_event.h>
-#include <asm/msr.h>
-#include <asm/insn.h>
-
-#include "perf_event.h"
-
-enum {
-       LBR_FORMAT_32           = 0x00,
-       LBR_FORMAT_LIP          = 0x01,
-       LBR_FORMAT_EIP          = 0x02,
-       LBR_FORMAT_EIP_FLAGS    = 0x03,
-       LBR_FORMAT_EIP_FLAGS2   = 0x04,
-       LBR_FORMAT_INFO         = 0x05,
-       LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_INFO,
-};
-
-static enum {
-       LBR_EIP_FLAGS           = 1,
-       LBR_TSX                 = 2,
-} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
-       [LBR_FORMAT_EIP_FLAGS]  = LBR_EIP_FLAGS,
-       [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
-};
-
-/*
- * Intel LBR_SELECT bits
- * Intel Vol3a, April 2011, Section 16.7 Table 16-10
- *
- * Hardware branch filter (not available on all CPUs)
- */
-#define LBR_KERNEL_BIT         0 /* do not capture at ring0 */
-#define LBR_USER_BIT           1 /* do not capture at ring > 0 */
-#define LBR_JCC_BIT            2 /* do not capture conditional branches */
-#define LBR_REL_CALL_BIT       3 /* do not capture relative calls */
-#define LBR_IND_CALL_BIT       4 /* do not capture indirect calls */
-#define LBR_RETURN_BIT         5 /* do not capture near returns */
-#define LBR_IND_JMP_BIT                6 /* do not capture indirect jumps */
-#define LBR_REL_JMP_BIT                7 /* do not capture relative jumps */
-#define LBR_FAR_BIT            8 /* do not capture far branches */
-#define LBR_CALL_STACK_BIT     9 /* enable call stack */
-
-/*
- * Following bit only exists in Linux; we mask it out before writing it to
- * the actual MSR. But it helps the constraint perf code to understand
- * that this is a separate configuration.
- */
-#define LBR_NO_INFO_BIT               63 /* don't read LBR_INFO. */
-
-#define LBR_KERNEL     (1 << LBR_KERNEL_BIT)
-#define LBR_USER       (1 << LBR_USER_BIT)
-#define LBR_JCC                (1 << LBR_JCC_BIT)
-#define LBR_REL_CALL   (1 << LBR_REL_CALL_BIT)
-#define LBR_IND_CALL   (1 << LBR_IND_CALL_BIT)
-#define LBR_RETURN     (1 << LBR_RETURN_BIT)
-#define LBR_REL_JMP    (1 << LBR_REL_JMP_BIT)
-#define LBR_IND_JMP    (1 << LBR_IND_JMP_BIT)
-#define LBR_FAR                (1 << LBR_FAR_BIT)
-#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
-#define LBR_NO_INFO    (1ULL << LBR_NO_INFO_BIT)
-
-#define LBR_PLM (LBR_KERNEL | LBR_USER)
-
-#define LBR_SEL_MASK   0x1ff   /* valid bits in LBR_SELECT */
-#define LBR_NOT_SUPP   -1      /* LBR filter not supported */
-#define LBR_IGN                0       /* ignored */
-
-#define LBR_ANY                 \
-       (LBR_JCC        |\
-        LBR_REL_CALL   |\
-        LBR_IND_CALL   |\
-        LBR_RETURN     |\
-        LBR_REL_JMP    |\
-        LBR_IND_JMP    |\
-        LBR_FAR)
-
-#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
-#define LBR_FROM_FLAG_IN_TX    (1ULL << 62)
-#define LBR_FROM_FLAG_ABORT    (1ULL << 61)
-
-/*
- * x86control flow change classification
- * x86control flow changes include branches, interrupts, traps, faults
- */
-enum {
-       X86_BR_NONE             = 0,      /* unknown */
-
-       X86_BR_USER             = 1 << 0, /* branch target is user */
-       X86_BR_KERNEL           = 1 << 1, /* branch target is kernel */
-
-       X86_BR_CALL             = 1 << 2, /* call */
-       X86_BR_RET              = 1 << 3, /* return */
-       X86_BR_SYSCALL          = 1 << 4, /* syscall */
-       X86_BR_SYSRET           = 1 << 5, /* syscall return */
-       X86_BR_INT              = 1 << 6, /* sw interrupt */
-       X86_BR_IRET             = 1 << 7, /* return from interrupt */
-       X86_BR_JCC              = 1 << 8, /* conditional */
-       X86_BR_JMP              = 1 << 9, /* jump */
-       X86_BR_IRQ              = 1 << 10,/* hw interrupt or trap or fault */
-       X86_BR_IND_CALL         = 1 << 11,/* indirect calls */
-       X86_BR_ABORT            = 1 << 12,/* transaction abort */
-       X86_BR_IN_TX            = 1 << 13,/* in transaction */
-       X86_BR_NO_TX            = 1 << 14,/* not in transaction */
-       X86_BR_ZERO_CALL        = 1 << 15,/* zero length call */
-       X86_BR_CALL_STACK       = 1 << 16,/* call stack */
-       X86_BR_IND_JMP          = 1 << 17,/* indirect jump */
-};
-
-#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
-#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)
-
-#define X86_BR_ANY       \
-       (X86_BR_CALL    |\
-        X86_BR_RET     |\
-        X86_BR_SYSCALL |\
-        X86_BR_SYSRET  |\
-        X86_BR_INT     |\
-        X86_BR_IRET    |\
-        X86_BR_JCC     |\
-        X86_BR_JMP      |\
-        X86_BR_IRQ      |\
-        X86_BR_ABORT    |\
-        X86_BR_IND_CALL |\
-        X86_BR_IND_JMP  |\
-        X86_BR_ZERO_CALL)
-
-#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
-
-#define X86_BR_ANY_CALL                 \
-       (X86_BR_CALL            |\
-        X86_BR_IND_CALL        |\
-        X86_BR_ZERO_CALL       |\
-        X86_BR_SYSCALL         |\
-        X86_BR_IRQ             |\
-        X86_BR_INT)
-
-static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
-
-/*
- * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
- * otherwise it becomes near impossible to get a reliable stack.
- */
-
-static void __intel_pmu_lbr_enable(bool pmi)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       u64 debugctl, lbr_select = 0, orig_debugctl;
-
-       /*
-        * No need to unfreeze manually, as v4 can do that as part
-        * of the GLOBAL_STATUS ack.
-        */
-       if (pmi && x86_pmu.version >= 4)
-               return;
-
-       /*
-        * No need to reprogram LBR_SELECT in a PMI, as it
-        * did not change.
-        */
-       if (cpuc->lbr_sel)
-               lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask;
-       if (!pmi && cpuc->lbr_sel)
-               wrmsrl(MSR_LBR_SELECT, lbr_select);
-
-       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-       orig_debugctl = debugctl;
-       debugctl |= DEBUGCTLMSR_LBR;
-       /*
-        * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
-        * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
-        * may cause superfluous increase/decrease of LBR_TOS.
-        */
-       if (!(lbr_select & LBR_CALL_STACK))
-               debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
-       if (orig_debugctl != debugctl)
-               wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-}
-
-static void __intel_pmu_lbr_disable(void)
-{
-       u64 debugctl;
-
-       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-       debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
-       wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-}
-
-static void intel_pmu_lbr_reset_32(void)
-{
-       int i;
-
-       for (i = 0; i < x86_pmu.lbr_nr; i++)
-               wrmsrl(x86_pmu.lbr_from + i, 0);
-}
-
-static void intel_pmu_lbr_reset_64(void)
-{
-       int i;
-
-       for (i = 0; i < x86_pmu.lbr_nr; i++) {
-               wrmsrl(x86_pmu.lbr_from + i, 0);
-               wrmsrl(x86_pmu.lbr_to   + i, 0);
-               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-                       wrmsrl(MSR_LBR_INFO_0 + i, 0);
-       }
-}
-
-void intel_pmu_lbr_reset(void)
-{
-       if (!x86_pmu.lbr_nr)
-               return;
-
-       if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
-               intel_pmu_lbr_reset_32();
-       else
-               intel_pmu_lbr_reset_64();
-}
-
-/*
- * TOS = most recently recorded branch
- */
-static inline u64 intel_pmu_lbr_tos(void)
-{
-       u64 tos;
-
-       rdmsrl(x86_pmu.lbr_tos, tos);
-       return tos;
-}
-
-enum {
-       LBR_NONE,
-       LBR_VALID,
-};
-
-static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
-{
-       int i;
-       unsigned lbr_idx, mask;
-       u64 tos;
-
-       if (task_ctx->lbr_callstack_users == 0 ||
-           task_ctx->lbr_stack_state == LBR_NONE) {
-               intel_pmu_lbr_reset();
-               return;
-       }
-
-       mask = x86_pmu.lbr_nr - 1;
-       tos = task_ctx->tos;
-       for (i = 0; i < tos; i++) {
-               lbr_idx = (tos - i) & mask;
-               wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
-               wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
-               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-                       wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
-       }
-       wrmsrl(x86_pmu.lbr_tos, tos);
-       task_ctx->lbr_stack_state = LBR_NONE;
-}
-
-static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
-{
-       int i;
-       unsigned lbr_idx, mask;
-       u64 tos;
-
-       if (task_ctx->lbr_callstack_users == 0) {
-               task_ctx->lbr_stack_state = LBR_NONE;
-               return;
-       }
-
-       mask = x86_pmu.lbr_nr - 1;
-       tos = intel_pmu_lbr_tos();
-       for (i = 0; i < tos; i++) {
-               lbr_idx = (tos - i) & mask;
-               rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
-               rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
-               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-                       rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
-       }
-       task_ctx->tos = tos;
-       task_ctx->lbr_stack_state = LBR_VALID;
-}
-
-void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct x86_perf_task_context *task_ctx;
-
-       /*
-        * If LBR callstack feature is enabled and the stack was saved when
-        * the task was scheduled out, restore the stack. Otherwise flush
-        * the LBR stack.
-        */
-       task_ctx = ctx ? ctx->task_ctx_data : NULL;
-       if (task_ctx) {
-               if (sched_in) {
-                       __intel_pmu_lbr_restore(task_ctx);
-                       cpuc->lbr_context = ctx;
-               } else {
-                       __intel_pmu_lbr_save(task_ctx);
-               }
-               return;
-       }
-
-       /*
-        * When sampling the branck stack in system-wide, it may be
-        * necessary to flush the stack on context switch. This happens
-        * when the branch stack does not tag its entries with the pid
-        * of the current task. Otherwise it becomes impossible to
-        * associate a branch entry with a task. This ambiguity is more
-        * likely to appear when the branch stack supports priv level
-        * filtering and the user sets it to monitor only at the user
-        * level (which could be a useful measurement in system-wide
-        * mode). In that case, the risk is high of having a branch
-        * stack with branch from multiple tasks.
-        */
-       if (sched_in) {
-               intel_pmu_lbr_reset();
-               cpuc->lbr_context = ctx;
-       }
-}
-
-static inline bool branch_user_callstack(unsigned br_sel)
-{
-       return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
-}
-
-void intel_pmu_lbr_enable(struct perf_event *event)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct x86_perf_task_context *task_ctx;
-
-       if (!x86_pmu.lbr_nr)
-               return;
-
-       /*
-        * Reset the LBR stack if we changed task context to
-        * avoid data leaks.
-        */
-       if (event->ctx->task && cpuc->lbr_context != event->ctx) {
-               intel_pmu_lbr_reset();
-               cpuc->lbr_context = event->ctx;
-       }
-       cpuc->br_sel = event->hw.branch_reg.reg;
-
-       if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
-                                       event->ctx->task_ctx_data) {
-               task_ctx = event->ctx->task_ctx_data;
-               task_ctx->lbr_callstack_users++;
-       }
-
-       cpuc->lbr_users++;
-       perf_sched_cb_inc(event->ctx->pmu);
-}
-
-void intel_pmu_lbr_disable(struct perf_event *event)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct x86_perf_task_context *task_ctx;
-
-       if (!x86_pmu.lbr_nr)
-               return;
-
-       if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
-                                       event->ctx->task_ctx_data) {
-               task_ctx = event->ctx->task_ctx_data;
-               task_ctx->lbr_callstack_users--;
-       }
-
-       cpuc->lbr_users--;
-       WARN_ON_ONCE(cpuc->lbr_users < 0);
-       perf_sched_cb_dec(event->ctx->pmu);
-
-       if (cpuc->enabled && !cpuc->lbr_users) {
-               __intel_pmu_lbr_disable();
-               /* avoid stale pointer */
-               cpuc->lbr_context = NULL;
-       }
-}
-
-void intel_pmu_lbr_enable_all(bool pmi)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (cpuc->lbr_users)
-               __intel_pmu_lbr_enable(pmi);
-}
-
-void intel_pmu_lbr_disable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (cpuc->lbr_users)
-               __intel_pmu_lbr_disable();
-}
-
-static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
-{
-       unsigned long mask = x86_pmu.lbr_nr - 1;
-       u64 tos = intel_pmu_lbr_tos();
-       int i;
-
-       for (i = 0; i < x86_pmu.lbr_nr; i++) {
-               unsigned long lbr_idx = (tos - i) & mask;
-               union {
-                       struct {
-                               u32 from;
-                               u32 to;
-                       };
-                       u64     lbr;
-               } msr_lastbranch;
-
-               rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
-
-               cpuc->lbr_entries[i].from       = msr_lastbranch.from;
-               cpuc->lbr_entries[i].to         = msr_lastbranch.to;
-               cpuc->lbr_entries[i].mispred    = 0;
-               cpuc->lbr_entries[i].predicted  = 0;
-               cpuc->lbr_entries[i].reserved   = 0;
-       }
-       cpuc->lbr_stack.nr = i;
-}
-
-/*
- * Due to lack of segmentation in Linux the effective address (offset)
- * is the same as the linear address, allowing us to merge the LIP and EIP
- * LBR formats.
- */
-static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
-{
-       bool need_info = false;
-       unsigned long mask = x86_pmu.lbr_nr - 1;
-       int lbr_format = x86_pmu.intel_cap.lbr_format;
-       u64 tos = intel_pmu_lbr_tos();
-       int i;
-       int out = 0;
-       int num = x86_pmu.lbr_nr;
-
-       if (cpuc->lbr_sel) {
-               need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO);
-               if (cpuc->lbr_sel->config & LBR_CALL_STACK)
-                       num = tos;
-       }
-
-       for (i = 0; i < num; i++) {
-               unsigned long lbr_idx = (tos - i) & mask;
-               u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
-               int skip = 0;
-               u16 cycles = 0;
-               int lbr_flags = lbr_desc[lbr_format];
-
-               rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
-               rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
-
-               if (lbr_format == LBR_FORMAT_INFO && need_info) {
-                       u64 info;
-
-                       rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
-                       mis = !!(info & LBR_INFO_MISPRED);
-                       pred = !mis;
-                       in_tx = !!(info & LBR_INFO_IN_TX);
-                       abort = !!(info & LBR_INFO_ABORT);
-                       cycles = (info & LBR_INFO_CYCLES);
-               }
-               if (lbr_flags & LBR_EIP_FLAGS) {
-                       mis = !!(from & LBR_FROM_FLAG_MISPRED);
-                       pred = !mis;
-                       skip = 1;
-               }
-               if (lbr_flags & LBR_TSX) {
-                       in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
-                       abort = !!(from & LBR_FROM_FLAG_ABORT);
-                       skip = 3;
-               }
-               from = (u64)((((s64)from) << skip) >> skip);
-
-               /*
-                * Some CPUs report duplicated abort records,
-                * with the second entry not having an abort bit set.
-                * Skip them here. This loop runs backwards,
-                * so we need to undo the previous record.
-                * If the abort just happened outside the window
-                * the extra entry cannot be removed.
-                */
-               if (abort && x86_pmu.lbr_double_abort && out > 0)
-                       out--;
-
-               cpuc->lbr_entries[out].from      = from;
-               cpuc->lbr_entries[out].to        = to;
-               cpuc->lbr_entries[out].mispred   = mis;
-               cpuc->lbr_entries[out].predicted = pred;
-               cpuc->lbr_entries[out].in_tx     = in_tx;
-               cpuc->lbr_entries[out].abort     = abort;
-               cpuc->lbr_entries[out].cycles    = cycles;
-               cpuc->lbr_entries[out].reserved  = 0;
-               out++;
-       }
-       cpuc->lbr_stack.nr = out;
-}
-
-void intel_pmu_lbr_read(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (!cpuc->lbr_users)
-               return;
-
-       if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
-               intel_pmu_lbr_read_32(cpuc);
-       else
-               intel_pmu_lbr_read_64(cpuc);
-
-       intel_pmu_lbr_filter(cpuc);
-}
-
-/*
- * SW filter is used:
- * - in case there is no HW filter
- * - in case the HW filter has errata or limitations
- */
-static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
-{
-       u64 br_type = event->attr.branch_sample_type;
-       int mask = 0;
-
-       if (br_type & PERF_SAMPLE_BRANCH_USER)
-               mask |= X86_BR_USER;
-
-       if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
-               mask |= X86_BR_KERNEL;
-
-       /* we ignore BRANCH_HV here */
-
-       if (br_type & PERF_SAMPLE_BRANCH_ANY)
-               mask |= X86_BR_ANY;
-
-       if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
-               mask |= X86_BR_ANY_CALL;
-
-       if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
-               mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
-
-       if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
-               mask |= X86_BR_IND_CALL;
-
-       if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX)
-               mask |= X86_BR_ABORT;
-
-       if (br_type & PERF_SAMPLE_BRANCH_IN_TX)
-               mask |= X86_BR_IN_TX;
-
-       if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
-               mask |= X86_BR_NO_TX;
-
-       if (br_type & PERF_SAMPLE_BRANCH_COND)
-               mask |= X86_BR_JCC;
-
-       if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
-               if (!x86_pmu_has_lbr_callstack())
-                       return -EOPNOTSUPP;
-               if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
-                       return -EINVAL;
-               mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
-                       X86_BR_CALL_STACK;
-       }
-
-       if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP)
-               mask |= X86_BR_IND_JMP;
-
-       if (br_type & PERF_SAMPLE_BRANCH_CALL)
-               mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
-       /*
-        * stash actual user request into reg, it may
-        * be used by fixup code for some CPU
-        */
-       event->hw.branch_reg.reg = mask;
-       return 0;
-}
-
-/*
- * setup the HW LBR filter
- * Used only when available, may not be enough to disambiguate
- * all branches, may need the help of the SW filter
- */
-static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg;
-       u64 br_type = event->attr.branch_sample_type;
-       u64 mask = 0, v;
-       int i;
-
-       for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
-               if (!(br_type & (1ULL << i)))
-                       continue;
-
-               v = x86_pmu.lbr_sel_map[i];
-               if (v == LBR_NOT_SUPP)
-                       return -EOPNOTSUPP;
-
-               if (v != LBR_IGN)
-                       mask |= v;
-       }
-
-       reg = &event->hw.branch_reg;
-       reg->idx = EXTRA_REG_LBR;
-
-       /*
-        * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
-        * in suppress mode. So LBR_SELECT should be set to
-        * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
-        */
-       reg->config = mask ^ x86_pmu.lbr_sel_mask;
-
-       if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
-           (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
-           (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO))
-               reg->config |= LBR_NO_INFO;
-
-       return 0;
-}
-
-int intel_pmu_setup_lbr_filter(struct perf_event *event)
-{
-       int ret = 0;
-
-       /*
-        * no LBR on this PMU
-        */
-       if (!x86_pmu.lbr_nr)
-               return -EOPNOTSUPP;
-
-       /*
-        * setup SW LBR filter
-        */
-       ret = intel_pmu_setup_sw_lbr_filter(event);
-       if (ret)
-               return ret;
-
-       /*
-        * setup HW LBR filter, if any
-        */
-       if (x86_pmu.lbr_sel_map)
-               ret = intel_pmu_setup_hw_lbr_filter(event);
-
-       return ret;
-}
-
-/*
- * return the type of control flow change at address "from"
- * intruction is not necessarily a branch (in case of interrupt).
- *
- * The branch type returned also includes the priv level of the
- * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
- *
- * If a branch type is unknown OR the instruction cannot be
- * decoded (e.g., text page not present), then X86_BR_NONE is
- * returned.
- */
-static int branch_type(unsigned long from, unsigned long to, int abort)
-{
-       struct insn insn;
-       void *addr;
-       int bytes_read, bytes_left;
-       int ret = X86_BR_NONE;
-       int ext, to_plm, from_plm;
-       u8 buf[MAX_INSN_SIZE];
-       int is64 = 0;
-
-       to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
-       from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
-
-       /*
-        * maybe zero if lbr did not fill up after a reset by the time
-        * we get a PMU interrupt
-        */
-       if (from == 0 || to == 0)
-               return X86_BR_NONE;
-
-       if (abort)
-               return X86_BR_ABORT | to_plm;
-
-       if (from_plm == X86_BR_USER) {
-               /*
-                * can happen if measuring at the user level only
-                * and we interrupt in a kernel thread, e.g., idle.
-                */
-               if (!current->mm)
-                       return X86_BR_NONE;
-
-               /* may fail if text not present */
-               bytes_left = copy_from_user_nmi(buf, (void __user *)from,
-                                               MAX_INSN_SIZE);
-               bytes_read = MAX_INSN_SIZE - bytes_left;
-               if (!bytes_read)
-                       return X86_BR_NONE;
-
-               addr = buf;
-       } else {
-               /*
-                * The LBR logs any address in the IP, even if the IP just
-                * faulted. This means userspace can control the from address.
-                * Ensure we don't blindy read any address by validating it is
-                * a known text address.
-                */
-               if (kernel_text_address(from)) {
-                       addr = (void *)from;
-                       /*
-                        * Assume we can get the maximum possible size
-                        * when grabbing kernel data.  This is not
-                        * _strictly_ true since we could possibly be
-                        * executing up next to a memory hole, but
-                        * it is very unlikely to be a problem.
-                        */
-                       bytes_read = MAX_INSN_SIZE;
-               } else {
-                       return X86_BR_NONE;
-               }
-       }
-
-       /*
-        * decoder needs to know the ABI especially
-        * on 64-bit systems running 32-bit apps
-        */
-#ifdef CONFIG_X86_64
-       is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
-#endif
-       insn_init(&insn, addr, bytes_read, is64);
-       insn_get_opcode(&insn);
-       if (!insn.opcode.got)
-               return X86_BR_ABORT;
-
-       switch (insn.opcode.bytes[0]) {
-       case 0xf:
-               switch (insn.opcode.bytes[1]) {
-               case 0x05: /* syscall */
-               case 0x34: /* sysenter */
-                       ret = X86_BR_SYSCALL;
-                       break;
-               case 0x07: /* sysret */
-               case 0x35: /* sysexit */
-                       ret = X86_BR_SYSRET;
-                       break;
-               case 0x80 ... 0x8f: /* conditional */
-                       ret = X86_BR_JCC;
-                       break;
-               default:
-                       ret = X86_BR_NONE;
-               }
-               break;
-       case 0x70 ... 0x7f: /* conditional */
-               ret = X86_BR_JCC;
-               break;
-       case 0xc2: /* near ret */
-       case 0xc3: /* near ret */
-       case 0xca: /* far ret */
-       case 0xcb: /* far ret */
-               ret = X86_BR_RET;
-               break;
-       case 0xcf: /* iret */
-               ret = X86_BR_IRET;
-               break;
-       case 0xcc ... 0xce: /* int */
-               ret = X86_BR_INT;
-               break;
-       case 0xe8: /* call near rel */
-               insn_get_immediate(&insn);
-               if (insn.immediate1.value == 0) {
-                       /* zero length call */
-                       ret = X86_BR_ZERO_CALL;
-                       break;
-               }
-       case 0x9a: /* call far absolute */
-               ret = X86_BR_CALL;
-               break;
-       case 0xe0 ... 0xe3: /* loop jmp */
-               ret = X86_BR_JCC;
-               break;
-       case 0xe9 ... 0xeb: /* jmp */
-               ret = X86_BR_JMP;
-               break;
-       case 0xff: /* call near absolute, call far absolute ind */
-               insn_get_modrm(&insn);
-               ext = (insn.modrm.bytes[0] >> 3) & 0x7;
-               switch (ext) {
-               case 2: /* near ind call */
-               case 3: /* far ind call */
-                       ret = X86_BR_IND_CALL;
-                       break;
-               case 4:
-               case 5:
-                       ret = X86_BR_IND_JMP;
-                       break;
-               }
-               break;
-       default:
-               ret = X86_BR_NONE;
-       }
-       /*
-        * interrupts, traps, faults (and thus ring transition) may
-        * occur on any instructions. Thus, to classify them correctly,
-        * we need to first look at the from and to priv levels. If they
-        * are different and to is in the kernel, then it indicates
-        * a ring transition. If the from instruction is not a ring
-        * transition instr (syscall, systenter, int), then it means
-        * it was a irq, trap or fault.
-        *
-        * we have no way of detecting kernel to kernel faults.
-        */
-       if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
-           && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
-               ret = X86_BR_IRQ;
-
-       /*
-        * branch priv level determined by target as
-        * is done by HW when LBR_SELECT is implemented
-        */
-       if (ret != X86_BR_NONE)
-               ret |= to_plm;
-
-       return ret;
-}
-
-/*
- * implement actual branch filter based on user demand.
- * Hardware may not exactly satisfy that request, thus
- * we need to inspect opcodes. Mismatched branches are
- * discarded. Therefore, the number of branches returned
- * in PERF_SAMPLE_BRANCH_STACK sample may vary.
- */
-static void
-intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
-{
-       u64 from, to;
-       int br_sel = cpuc->br_sel;
-       int i, j, type;
-       bool compress = false;
-
-       /* if sampling all branches, then nothing to filter */
-       if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
-               return;
-
-       for (i = 0; i < cpuc->lbr_stack.nr; i++) {
-
-               from = cpuc->lbr_entries[i].from;
-               to = cpuc->lbr_entries[i].to;
-
-               type = branch_type(from, to, cpuc->lbr_entries[i].abort);
-               if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
-                       if (cpuc->lbr_entries[i].in_tx)
-                               type |= X86_BR_IN_TX;
-                       else
-                               type |= X86_BR_NO_TX;
-               }
-
-               /* if type does not correspond, then discard */
-               if (type == X86_BR_NONE || (br_sel & type) != type) {
-                       cpuc->lbr_entries[i].from = 0;
-                       compress = true;
-               }
-       }
-
-       if (!compress)
-               return;
-
-       /* remove all entries with from=0 */
-       for (i = 0; i < cpuc->lbr_stack.nr; ) {
-               if (!cpuc->lbr_entries[i].from) {
-                       j = i;
-                       while (++j < cpuc->lbr_stack.nr)
-                               cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
-                       cpuc->lbr_stack.nr--;
-                       if (!cpuc->lbr_entries[i].from)
-                               continue;
-               }
-               i++;
-       }
-}
-
-/*
- * Map interface branch filters onto LBR filters
- */
-static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
-       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
-       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
-       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
-       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
-       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_REL_JMP
-                                               | LBR_IND_JMP | LBR_FAR,
-       /*
-        * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
-        */
-       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
-        LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
-       /*
-        * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
-        */
-       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
-       [PERF_SAMPLE_BRANCH_COND_SHIFT]     = LBR_JCC,
-       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
-};
-
-static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
-       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
-       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
-       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
-       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
-       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_FAR,
-       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]     = LBR_REL_CALL | LBR_IND_CALL
-                                               | LBR_FAR,
-       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]     = LBR_IND_CALL,
-       [PERF_SAMPLE_BRANCH_COND_SHIFT]         = LBR_JCC,
-       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]     = LBR_IND_JMP,
-       [PERF_SAMPLE_BRANCH_CALL_SHIFT]         = LBR_REL_CALL,
-};
-
-static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
-       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
-       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
-       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
-       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
-       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_FAR,
-       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]     = LBR_REL_CALL | LBR_IND_CALL
-                                               | LBR_FAR,
-       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]     = LBR_IND_CALL,
-       [PERF_SAMPLE_BRANCH_COND_SHIFT]         = LBR_JCC,
-       [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT]   = LBR_REL_CALL | LBR_IND_CALL
-                                               | LBR_RETURN | LBR_CALL_STACK,
-       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]     = LBR_IND_JMP,
-       [PERF_SAMPLE_BRANCH_CALL_SHIFT]         = LBR_REL_CALL,
-};
-
-/* core */
-void __init intel_pmu_lbr_init_core(void)
-{
-       x86_pmu.lbr_nr     = 4;
-       x86_pmu.lbr_tos    = MSR_LBR_TOS;
-       x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
-       x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
-
-       /*
-        * SW branch filter usage:
-        * - compensate for lack of HW filter
-        */
-       pr_cont("4-deep LBR, ");
-}
-
-/* nehalem/westmere */
-void __init intel_pmu_lbr_init_nhm(void)
-{
-       x86_pmu.lbr_nr     = 16;
-       x86_pmu.lbr_tos    = MSR_LBR_TOS;
-       x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
-       x86_pmu.lbr_to     = MSR_LBR_NHM_TO;
-
-       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
-       x86_pmu.lbr_sel_map  = nhm_lbr_sel_map;
-
-       /*
-        * SW branch filter usage:
-        * - workaround LBR_SEL errata (see above)
-        * - support syscall, sysret capture.
-        *   That requires LBR_FAR but that means far
-        *   jmp need to be filtered out
-        */
-       pr_cont("16-deep LBR, ");
-}
-
-/* sandy bridge */
-void __init intel_pmu_lbr_init_snb(void)
-{
-       x86_pmu.lbr_nr   = 16;
-       x86_pmu.lbr_tos  = MSR_LBR_TOS;
-       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
-       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
-
-       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
-       x86_pmu.lbr_sel_map  = snb_lbr_sel_map;
-
-       /*
-        * SW branch filter usage:
-        * - support syscall, sysret capture.
-        *   That requires LBR_FAR but that means far
-        *   jmp need to be filtered out
-        */
-       pr_cont("16-deep LBR, ");
-}
-
-/* haswell */
-void intel_pmu_lbr_init_hsw(void)
-{
-       x86_pmu.lbr_nr   = 16;
-       x86_pmu.lbr_tos  = MSR_LBR_TOS;
-       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
-       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
-
-       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
-       x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
-
-       pr_cont("16-deep LBR, ");
-}
-
-/* skylake */
-__init void intel_pmu_lbr_init_skl(void)
-{
-       x86_pmu.lbr_nr   = 32;
-       x86_pmu.lbr_tos  = MSR_LBR_TOS;
-       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
-       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
-
-       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
-       x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
-
-       /*
-        * SW branch filter usage:
-        * - support syscall, sysret capture.
-        *   That requires LBR_FAR but that means far
-        *   jmp need to be filtered out
-        */
-       pr_cont("32-deep LBR, ");
-}
-
-/* atom */
-void __init intel_pmu_lbr_init_atom(void)
-{
-       /*
-        * only models starting at stepping 10 seems
-        * to have an operational LBR which can freeze
-        * on PMU interrupt
-        */
-       if (boot_cpu_data.x86_model == 28
-           && boot_cpu_data.x86_mask < 10) {
-               pr_cont("LBR disabled due to erratum");
-               return;
-       }
-
-       x86_pmu.lbr_nr     = 8;
-       x86_pmu.lbr_tos    = MSR_LBR_TOS;
-       x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
-       x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
-
-       /*
-        * SW branch filter usage:
-        * - compensate for lack of HW filter
-        */
-       pr_cont("8-deep LBR, ");
-}
-
-/* Knights Landing */
-void intel_pmu_lbr_init_knl(void)
-{
-       x86_pmu.lbr_nr     = 8;
-       x86_pmu.lbr_tos    = MSR_LBR_TOS;
-       x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
-       x86_pmu.lbr_to     = MSR_LBR_NHM_TO;
-
-       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
-       x86_pmu.lbr_sel_map  = snb_lbr_sel_map;
-
-       pr_cont("8-deep LBR, ");
-}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c

deleted file mode 100644 (file)

index c0bbd10..0000000
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ /dev/null
@@ -1,1188 +0,0 @@
-/*
- * Intel(R) Processor Trace PMU driver for perf
- * Copyright (c) 2013-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * Intel PT is specified in the Intel Architecture Instruction Set Extensions
- * Programming Reference:
- * http://software.intel.com/en-us/intel-isa-extensions
- */
-
-#undef DEBUG
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/device.h>
-
-#include <asm/perf_event.h>
-#include <asm/insn.h>
-#include <asm/io.h>
-#include <asm/intel_pt.h>
-
-#include "perf_event.h"
-#include "intel_pt.h"
-
-static DEFINE_PER_CPU(struct pt, pt_ctx);
-
-static struct pt_pmu pt_pmu;
-
-enum cpuid_regs {
-       CR_EAX = 0,
-       CR_ECX,
-       CR_EDX,
-       CR_EBX
-};
-
-/*
- * Capabilities of Intel PT hardware, such as number of address bits or
- * supported output schemes, are cached and exported to userspace as "caps"
- * attribute group of pt pmu device
- * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
- * relevant bits together with intel_pt traces.
- *
- * These are necessary for both trace decoding (payloads_lip, contains address
- * width encoded in IP-related packets), and event configuration (bitmasks with
- * permitted values for certain bit fields).
- */
-#define PT_CAP(_n, _l, _r, _m)                                         \
-       [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,        \
-                           .reg = _r, .mask = _m }
-
-static struct pt_cap_desc {
-       const char      *name;
-       u32             leaf;
-       u8              reg;
-       u32             mask;
-} pt_caps[] = {
-       PT_CAP(max_subleaf,             0, CR_EAX, 0xffffffff),
-       PT_CAP(cr3_filtering,           0, CR_EBX, BIT(0)),
-       PT_CAP(psb_cyc,                 0, CR_EBX, BIT(1)),
-       PT_CAP(mtc,                     0, CR_EBX, BIT(3)),
-       PT_CAP(topa_output,             0, CR_ECX, BIT(0)),
-       PT_CAP(topa_multiple_entries,   0, CR_ECX, BIT(1)),
-       PT_CAP(single_range_output,     0, CR_ECX, BIT(2)),
-       PT_CAP(payloads_lip,            0, CR_ECX, BIT(31)),
-       PT_CAP(mtc_periods,             1, CR_EAX, 0xffff0000),
-       PT_CAP(cycle_thresholds,        1, CR_EBX, 0xffff),
-       PT_CAP(psb_periods,             1, CR_EBX, 0xffff0000),
-};
-
-static u32 pt_cap_get(enum pt_capabilities cap)
-{
-       struct pt_cap_desc *cd = &pt_caps[cap];
-       u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
-       unsigned int shift = __ffs(cd->mask);
-
-       return (c & cd->mask) >> shift;
-}
-
-static ssize_t pt_cap_show(struct device *cdev,
-                          struct device_attribute *attr,
-                          char *buf)
-{
-       struct dev_ext_attribute *ea =
-               container_of(attr, struct dev_ext_attribute, attr);
-       enum pt_capabilities cap = (long)ea->var;
-
-       return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
-}
-
-static struct attribute_group pt_cap_group = {
-       .name   = "caps",
-};
-
-PMU_FORMAT_ATTR(cyc,           "config:1"      );
-PMU_FORMAT_ATTR(mtc,           "config:9"      );
-PMU_FORMAT_ATTR(tsc,           "config:10"     );
-PMU_FORMAT_ATTR(noretcomp,     "config:11"     );
-PMU_FORMAT_ATTR(mtc_period,    "config:14-17"  );
-PMU_FORMAT_ATTR(cyc_thresh,    "config:19-22"  );
-PMU_FORMAT_ATTR(psb_period,    "config:24-27"  );
-
-static struct attribute *pt_formats_attr[] = {
-       &format_attr_cyc.attr,
-       &format_attr_mtc.attr,
-       &format_attr_tsc.attr,
-       &format_attr_noretcomp.attr,
-       &format_attr_mtc_period.attr,
-       &format_attr_cyc_thresh.attr,
-       &format_attr_psb_period.attr,
-       NULL,
-};
-
-static struct attribute_group pt_format_group = {
-       .name   = "format",
-       .attrs  = pt_formats_attr,
-};
-
-static const struct attribute_group *pt_attr_groups[] = {
-       &pt_cap_group,
-       &pt_format_group,
-       NULL,
-};
-
-static int __init pt_pmu_hw_init(void)
-{
-       struct dev_ext_attribute *de_attrs;
-       struct attribute **attrs;
-       size_t size;
-       int ret;
-       long i;
-
-       attrs = NULL;
-
-       for (i = 0; i < PT_CPUID_LEAVES; i++) {
-               cpuid_count(20, i,
-                           &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM],
-                           &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM],
-                           &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM],
-                           &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]);
-       }
-
-       ret = -ENOMEM;
-       size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
-       attrs = kzalloc(size, GFP_KERNEL);
-       if (!attrs)
-               goto fail;
-
-       size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
-       de_attrs = kzalloc(size, GFP_KERNEL);
-       if (!de_attrs)
-               goto fail;
-
-       for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
-               struct dev_ext_attribute *de_attr = de_attrs + i;
-
-               de_attr->attr.attr.name = pt_caps[i].name;
-
-               sysfs_attr_init(&de_attr->attr.attr);
-
-               de_attr->attr.attr.mode         = S_IRUGO;
-               de_attr->attr.show              = pt_cap_show;
-               de_attr->var                    = (void *)i;
-
-               attrs[i] = &de_attr->attr.attr;
-       }
-
-       pt_cap_group.attrs = attrs;
-
-       return 0;
-
-fail:
-       kfree(attrs);
-
-       return ret;
-}
-
-#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC    | \
-                         RTIT_CTL_CYC_THRESH   | \
-                         RTIT_CTL_PSB_FREQ)
-
-#define RTIT_CTL_MTC   (RTIT_CTL_MTC_EN        | \
-                        RTIT_CTL_MTC_RANGE)
-
-#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN                | \
-                       RTIT_CTL_DISRETC        | \
-                       RTIT_CTL_CYC_PSB        | \
-                       RTIT_CTL_MTC)
-
-static bool pt_event_valid(struct perf_event *event)
-{
-       u64 config = event->attr.config;
-       u64 allowed, requested;
-
-       if ((config & PT_CONFIG_MASK) != config)
-               return false;
-
-       if (config & RTIT_CTL_CYC_PSB) {
-               if (!pt_cap_get(PT_CAP_psb_cyc))
-                       return false;
-
-               allowed = pt_cap_get(PT_CAP_psb_periods);
-               requested = (config & RTIT_CTL_PSB_FREQ) >>
-                       RTIT_CTL_PSB_FREQ_OFFSET;
-               if (requested && (!(allowed & BIT(requested))))
-                       return false;
-
-               allowed = pt_cap_get(PT_CAP_cycle_thresholds);
-               requested = (config & RTIT_CTL_CYC_THRESH) >>
-                       RTIT_CTL_CYC_THRESH_OFFSET;
-               if (requested && (!(allowed & BIT(requested))))
-                       return false;
-       }
-
-       if (config & RTIT_CTL_MTC) {
-               /*
-                * In the unlikely case that CPUID lists valid mtc periods,
-                * but not the mtc capability, drop out here.
-                *
-                * Spec says that setting mtc period bits while mtc bit in
-                * CPUID is 0 will #GP, so better safe than sorry.
-                */
-               if (!pt_cap_get(PT_CAP_mtc))
-                       return false;
-
-               allowed = pt_cap_get(PT_CAP_mtc_periods);
-               if (!allowed)
-                       return false;
-
-               requested = (config & RTIT_CTL_MTC_RANGE) >>
-                       RTIT_CTL_MTC_RANGE_OFFSET;
-
-               if (!(allowed & BIT(requested)))
-                       return false;
-       }
-
-       return true;
-}
-
-/*
- * PT configuration helpers
- * These all are cpu affine and operate on a local PT
- */
-
-static void pt_config(struct perf_event *event)
-{
-       u64 reg;
-
-       if (!event->hw.itrace_started) {
-               event->hw.itrace_started = 1;
-               wrmsrl(MSR_IA32_RTIT_STATUS, 0);
-       }
-
-       reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
-
-       if (!event->attr.exclude_kernel)
-               reg |= RTIT_CTL_OS;
-       if (!event->attr.exclude_user)
-               reg |= RTIT_CTL_USR;
-
-       reg |= (event->attr.config & PT_CONFIG_MASK);
-
-       wrmsrl(MSR_IA32_RTIT_CTL, reg);
-}
-
-static void pt_config_start(bool start)
-{
-       u64 ctl;
-
-       rdmsrl(MSR_IA32_RTIT_CTL, ctl);
-       if (start)
-               ctl |= RTIT_CTL_TRACEEN;
-       else
-               ctl &= ~RTIT_CTL_TRACEEN;
-       wrmsrl(MSR_IA32_RTIT_CTL, ctl);
-
-       /*
-        * A wrmsr that disables trace generation serializes other PT
-        * registers and causes all data packets to be written to memory,
-        * but a fence is required for the data to become globally visible.
-        *
-        * The below WMB, separating data store and aux_head store matches
-        * the consumer's RMB that separates aux_head load and data load.
-        */
-       if (!start)
-               wmb();
-}
-
-static void pt_config_buffer(void *buf, unsigned int topa_idx,
-                            unsigned int output_off)
-{
-       u64 reg;
-
-       wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
-
-       reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
-
-       wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
-}
-
-/*
- * Keep ToPA table-related metadata on the same page as the actual table,
- * taking up a few words from the top
- */
-
-#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
-
-/**
- * struct topa - page-sized ToPA table with metadata at the top
- * @table:     actual ToPA table entries, as understood by PT hardware
- * @list:      linkage to struct pt_buffer's list of tables
- * @phys:      physical address of this page
- * @offset:    offset of the first entry in this table in the buffer
- * @size:      total size of all entries in this table
- * @last:      index of the last initialized entry in this table
- */
-struct topa {
-       struct topa_entry       table[TENTS_PER_PAGE];
-       struct list_head        list;
-       u64                     phys;
-       u64                     offset;
-       size_t                  size;
-       int                     last;
-};
-
-/* make -1 stand for the last table entry */
-#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
-
-/**
- * topa_alloc() - allocate page-sized ToPA table
- * @cpu:       CPU on which to allocate.
- * @gfp:       Allocation flags.
- *
- * Return:     On success, return the pointer to ToPA table page.
- */
-static struct topa *topa_alloc(int cpu, gfp_t gfp)
-{
-       int node = cpu_to_node(cpu);
-       struct topa *topa;
-       struct page *p;
-
-       p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
-       if (!p)
-               return NULL;
-
-       topa = page_address(p);
-       topa->last = 0;
-       topa->phys = page_to_phys(p);
-
-       /*
-        * In case of singe-entry ToPA, always put the self-referencing END
-        * link as the 2nd entry in the table
-        */
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
-               TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
-               TOPA_ENTRY(topa, 1)->end = 1;
-       }
-
-       return topa;
-}
-
-/**
- * topa_free() - free a page-sized ToPA table
- * @topa:      Table to deallocate.
- */
-static void topa_free(struct topa *topa)
-{
-       free_page((unsigned long)topa);
-}
-
-/**
- * topa_insert_table() - insert a ToPA table into a buffer
- * @buf:        PT buffer that's being extended.
- * @topa:       New topa table to be inserted.
- *
- * If it's the first table in this buffer, set up buffer's pointers
- * accordingly; otherwise, add a END=1 link entry to @topa to the current
- * "last" table and adjust the last table pointer to @topa.
- */
-static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
-{
-       struct topa *last = buf->last;
-
-       list_add_tail(&topa->list, &buf->tables);
-
-       if (!buf->first) {
-               buf->first = buf->last = buf->cur = topa;
-               return;
-       }
-
-       topa->offset = last->offset + last->size;
-       buf->last = topa;
-
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
-               return;
-
-       BUG_ON(last->last != TENTS_PER_PAGE - 1);
-
-       TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
-       TOPA_ENTRY(last, -1)->end = 1;
-}
-
-/**
- * topa_table_full() - check if a ToPA table is filled up
- * @topa:      ToPA table.
- */
-static bool topa_table_full(struct topa *topa)
-{
-       /* single-entry ToPA is a special case */
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
-               return !!topa->last;
-
-       return topa->last == TENTS_PER_PAGE - 1;
-}
-
-/**
- * topa_insert_pages() - create a list of ToPA tables
- * @buf:       PT buffer being initialized.
- * @gfp:       Allocation flags.
- *
- * This initializes a list of ToPA tables with entries from
- * the data_pages provided by rb_alloc_aux().
- *
- * Return:     0 on success or error code.
- */
-static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
-{
-       struct topa *topa = buf->last;
-       int order = 0;
-       struct page *p;
-
-       p = virt_to_page(buf->data_pages[buf->nr_pages]);
-       if (PagePrivate(p))
-               order = page_private(p);
-
-       if (topa_table_full(topa)) {
-               topa = topa_alloc(buf->cpu, gfp);
-               if (!topa)
-                       return -ENOMEM;
-
-               topa_insert_table(buf, topa);
-       }
-
-       TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
-       TOPA_ENTRY(topa, -1)->size = order;
-       if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
-               TOPA_ENTRY(topa, -1)->intr = 1;
-               TOPA_ENTRY(topa, -1)->stop = 1;
-       }
-
-       topa->last++;
-       topa->size += sizes(order);
-
-       buf->nr_pages += 1ul << order;
-
-       return 0;
-}
-
-/**
- * pt_topa_dump() - print ToPA tables and their entries
- * @buf:       PT buffer.
- */
-static void pt_topa_dump(struct pt_buffer *buf)
-{
-       struct topa *topa;
-
-       list_for_each_entry(topa, &buf->tables, list) {
-               int i;
-
-               pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
-                        topa->phys, topa->offset, topa->size);
-               for (i = 0; i < TENTS_PER_PAGE; i++) {
-                       pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
-                                &topa->table[i],
-                                (unsigned long)topa->table[i].base << TOPA_SHIFT,
-                                sizes(topa->table[i].size),
-                                topa->table[i].end ?  'E' : ' ',
-                                topa->table[i].intr ? 'I' : ' ',
-                                topa->table[i].stop ? 'S' : ' ',
-                                *(u64 *)&topa->table[i]);
-                       if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
-                            topa->table[i].stop) ||
-                           topa->table[i].end)
-                               break;
-               }
-       }
-}
-
-/**
- * pt_buffer_advance() - advance to the next output region
- * @buf:       PT buffer.
- *
- * Advance the current pointers in the buffer to the next ToPA entry.
- */
-static void pt_buffer_advance(struct pt_buffer *buf)
-{
-       buf->output_off = 0;
-       buf->cur_idx++;
-
-       if (buf->cur_idx == buf->cur->last) {
-               if (buf->cur == buf->last)
-                       buf->cur = buf->first;
-               else
-                       buf->cur = list_entry(buf->cur->list.next, struct topa,
-                                             list);
-               buf->cur_idx = 0;
-       }
-}
-
-/**
- * pt_update_head() - calculate current offsets and sizes
- * @pt:                Per-cpu pt context.
- *
- * Update buffer's current write pointer position and data size.
- */
-static void pt_update_head(struct pt *pt)
-{
-       struct pt_buffer *buf = perf_get_aux(&pt->handle);
-       u64 topa_idx, base, old;
-
-       /* offset of the first region in this table from the beginning of buf */
-       base = buf->cur->offset + buf->output_off;
-
-       /* offset of the current output region within this table */
-       for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
-               base += sizes(buf->cur->table[topa_idx].size);
-
-       if (buf->snapshot) {
-               local_set(&buf->data_size, base);
-       } else {
-               old = (local64_xchg(&buf->head, base) &
-                      ((buf->nr_pages << PAGE_SHIFT) - 1));
-               if (base < old)
-                       base += buf->nr_pages << PAGE_SHIFT;
-
-               local_add(base - old, &buf->data_size);
-       }
-}
-
-/**
- * pt_buffer_region() - obtain current output region's address
- * @buf:       PT buffer.
- */
-static void *pt_buffer_region(struct pt_buffer *buf)
-{
-       return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
-}
-
-/**
- * pt_buffer_region_size() - obtain current output region's size
- * @buf:       PT buffer.
- */
-static size_t pt_buffer_region_size(struct pt_buffer *buf)
-{
-       return sizes(buf->cur->table[buf->cur_idx].size);
-}
-
-/**
- * pt_handle_status() - take care of possible status conditions
- * @pt:                Per-cpu pt context.
- */
-static void pt_handle_status(struct pt *pt)
-{
-       struct pt_buffer *buf = perf_get_aux(&pt->handle);
-       int advance = 0;
-       u64 status;
-
-       rdmsrl(MSR_IA32_RTIT_STATUS, status);
-
-       if (status & RTIT_STATUS_ERROR) {
-               pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
-               pt_topa_dump(buf);
-               status &= ~RTIT_STATUS_ERROR;
-       }
-
-       if (status & RTIT_STATUS_STOPPED) {
-               status &= ~RTIT_STATUS_STOPPED;
-
-               /*
-                * On systems that only do single-entry ToPA, hitting STOP
-                * means we are already losing data; need to let the decoder
-                * know.
-                */
-               if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
-                   buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
-                       local_inc(&buf->lost);
-                       advance++;
-               }
-       }
-
-       /*
-        * Also on single-entry ToPA implementations, interrupt will come
-        * before the output reaches its output region's boundary.
-        */
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
-           pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
-               void *head = pt_buffer_region(buf);
-
-               /* everything within this margin needs to be zeroed out */
-               memset(head + buf->output_off, 0,
-                      pt_buffer_region_size(buf) -
-                      buf->output_off);
-               advance++;
-       }
-
-       if (advance)
-               pt_buffer_advance(buf);
-
-       wrmsrl(MSR_IA32_RTIT_STATUS, status);
-}
-
-/**
- * pt_read_offset() - translate registers into buffer pointers
- * @buf:       PT buffer.
- *
- * Set buffer's output pointers from MSR values.
- */
-static void pt_read_offset(struct pt_buffer *buf)
-{
-       u64 offset, base_topa;
-
-       rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
-       buf->cur = phys_to_virt(base_topa);
-
-       rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
-       /* offset within current output region */
-       buf->output_off = offset >> 32;
-       /* index of current output region within this table */
-       buf->cur_idx = (offset & 0xffffff80) >> 7;
-}
-
-/**
- * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
- * @buf:       PT buffer.
- * @pg:                Page offset in the buffer.
- *
- * When advancing to the next output region (ToPA entry), given a page offset
- * into the buffer, we need to find the offset of the first page in the next
- * region.
- */
-static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
-{
-       struct topa_entry *te = buf->topa_index[pg];
-
-       /* one region */
-       if (buf->first == buf->last && buf->first->last == 1)
-               return pg;
-
-       do {
-               pg++;
-               pg &= buf->nr_pages - 1;
-       } while (buf->topa_index[pg] == te);
-
-       return pg;
-}
-
-/**
- * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
- * @buf:       PT buffer.
- * @handle:    Current output handle.
- *
- * Place INT and STOP marks to prevent overwriting old data that the consumer
- * hasn't yet collected and waking up the consumer after a certain fraction of
- * the buffer has filled up. Only needed and sensible for non-snapshot counters.
- *
- * This obviously relies on buf::head to figure out buffer markers, so it has
- * to be called after pt_buffer_reset_offsets() and before the hardware tracing
- * is enabled.
- */
-static int pt_buffer_reset_markers(struct pt_buffer *buf,
-                                  struct perf_output_handle *handle)
-
-{
-       unsigned long head = local64_read(&buf->head);
-       unsigned long idx, npages, wakeup;
-
-       /* can't stop in the middle of an output region */
-       if (buf->output_off + handle->size + 1 <
-           sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
-               return -EINVAL;
-
-
-       /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
-               return 0;
-
-       /* clear STOP and INT from current entry */
-       buf->topa_index[buf->stop_pos]->stop = 0;
-       buf->topa_index[buf->intr_pos]->intr = 0;
-
-       /* how many pages till the STOP marker */
-       npages = handle->size >> PAGE_SHIFT;
-
-       /* if it's on a page boundary, fill up one more page */
-       if (!offset_in_page(head + handle->size + 1))
-               npages++;
-
-       idx = (head >> PAGE_SHIFT) + npages;
-       idx &= buf->nr_pages - 1;
-       buf->stop_pos = idx;
-
-       wakeup = handle->wakeup >> PAGE_SHIFT;
-
-       /* in the worst case, wake up the consumer one page before hard stop */
-       idx = (head >> PAGE_SHIFT) + npages - 1;
-       if (idx > wakeup)
-               idx = wakeup;
-
-       idx &= buf->nr_pages - 1;
-       buf->intr_pos = idx;
-
-       buf->topa_index[buf->stop_pos]->stop = 1;
-       buf->topa_index[buf->intr_pos]->intr = 1;
-
-       return 0;
-}
-
-/**
- * pt_buffer_setup_topa_index() - build topa_index[] table of regions
- * @buf:       PT buffer.
- *
- * topa_index[] references output regions indexed by offset into the
- * buffer for purposes of quick reverse lookup.
- */
-static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
-{
-       struct topa *cur = buf->first, *prev = buf->last;
-       struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
-               *te_prev = TOPA_ENTRY(prev, prev->last - 1);
-       int pg = 0, idx = 0;
-
-       while (pg < buf->nr_pages) {
-               int tidx;
-
-               /* pages within one topa entry */
-               for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
-                       buf->topa_index[pg] = te_prev;
-
-               te_prev = te_cur;
-
-               if (idx == cur->last - 1) {
-                       /* advance to next topa table */
-                       idx = 0;
-                       cur = list_entry(cur->list.next, struct topa, list);
-               } else {
-                       idx++;
-               }
-               te_cur = TOPA_ENTRY(cur, idx);
-       }
-
-}
-
-/**
- * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
- * @buf:       PT buffer.
- * @head:      Write pointer (aux_head) from AUX buffer.
- *
- * Find the ToPA table and entry corresponding to given @head and set buffer's
- * "current" pointers accordingly. This is done after we have obtained the
- * current aux_head position from a successful call to perf_aux_output_begin()
- * to make sure the hardware is writing to the right place.
- *
- * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
- * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
- * which are used to determine INT and STOP markers' locations by a subsequent
- * call to pt_buffer_reset_markers().
- */
-static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
-{
-       int pg;
-
-       if (buf->snapshot)
-               head &= (buf->nr_pages << PAGE_SHIFT) - 1;
-
-       pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
-       pg = pt_topa_next_entry(buf, pg);
-
-       buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
-       buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
-                       (unsigned long)buf->cur) / sizeof(struct topa_entry);
-       buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
-
-       local64_set(&buf->head, head);
-       local_set(&buf->data_size, 0);
-}
-
-/**
- * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
- * @buf:       PT buffer.
- */
-static void pt_buffer_fini_topa(struct pt_buffer *buf)
-{
-       struct topa *topa, *iter;
-
-       list_for_each_entry_safe(topa, iter, &buf->tables, list) {
-               /*
-                * right now, this is in free_aux() path only, so
-                * no need to unlink this table from the list
-                */
-               topa_free(topa);
-       }
-}
-
-/**
- * pt_buffer_init_topa() - initialize ToPA table for pt buffer
- * @buf:       PT buffer.
- * @size:      Total size of all regions within this ToPA.
- * @gfp:       Allocation flags.
- */
-static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
-                              gfp_t gfp)
-{
-       struct topa *topa;
-       int err;
-
-       topa = topa_alloc(buf->cpu, gfp);
-       if (!topa)
-               return -ENOMEM;
-
-       topa_insert_table(buf, topa);
-
-       while (buf->nr_pages < nr_pages) {
-               err = topa_insert_pages(buf, gfp);
-               if (err) {
-                       pt_buffer_fini_topa(buf);
-                       return -ENOMEM;
-               }
-       }
-
-       pt_buffer_setup_topa_index(buf);
-
-       /* link last table to the first one, unless we're double buffering */
-       if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
-               TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
-               TOPA_ENTRY(buf->last, -1)->end = 1;
-       }
-
-       pt_topa_dump(buf);
-       return 0;
-}
-
-/**
- * pt_buffer_setup_aux() - set up topa tables for a PT buffer
- * @cpu:       Cpu on which to allocate, -1 means current.
- * @pages:     Array of pointers to buffer pages passed from perf core.
- * @nr_pages:  Number of pages in the buffer.
- * @snapshot:  If this is a snapshot/overwrite counter.
- *
- * This is a pmu::setup_aux callback that sets up ToPA tables and all the
- * bookkeeping for an AUX buffer.
- *
- * Return:     Our private PT buffer structure.
- */
-static void *
-pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
-{
-       struct pt_buffer *buf;
-       int node, ret;
-
-       if (!nr_pages)
-               return NULL;
-
-       if (cpu == -1)
-               cpu = raw_smp_processor_id();
-       node = cpu_to_node(cpu);
-
-       buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
-                          GFP_KERNEL, node);
-       if (!buf)
-               return NULL;
-
-       buf->cpu = cpu;
-       buf->snapshot = snapshot;
-       buf->data_pages = pages;
-
-       INIT_LIST_HEAD(&buf->tables);
-
-       ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
-       if (ret) {
-               kfree(buf);
-               return NULL;
-       }
-
-       return buf;
-}
-
-/**
- * pt_buffer_free_aux() - perf AUX deallocation path callback
- * @data:      PT buffer.
- */
-static void pt_buffer_free_aux(void *data)
-{
-       struct pt_buffer *buf = data;
-
-       pt_buffer_fini_topa(buf);
-       kfree(buf);
-}
-
-/**
- * pt_buffer_is_full() - check if the buffer is full
- * @buf:       PT buffer.
- * @pt:                Per-cpu pt handle.
- *
- * If the user hasn't read data from the output region that aux_head
- * points to, the buffer is considered full: the user needs to read at
- * least this region and update aux_tail to point past it.
- */
-static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
-{
-       if (buf->snapshot)
-               return false;
-
-       if (local_read(&buf->data_size) >= pt->handle.size)
-               return true;
-
-       return false;
-}
-
-/**
- * intel_pt_interrupt() - PT PMI handler
- */
-void intel_pt_interrupt(void)
-{
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-       struct pt_buffer *buf;
-       struct perf_event *event = pt->handle.event;
-
-       /*
-        * There may be a dangling PT bit in the interrupt status register
-        * after PT has been disabled by pt_event_stop(). Make sure we don't
-        * do anything (particularly, re-enable) for this event here.
-        */
-       if (!ACCESS_ONCE(pt->handle_nmi))
-               return;
-
-       pt_config_start(false);
-
-       if (!event)
-               return;
-
-       buf = perf_get_aux(&pt->handle);
-       if (!buf)
-               return;
-
-       pt_read_offset(buf);
-
-       pt_handle_status(pt);
-
-       pt_update_head(pt);
-
-       perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
-                           local_xchg(&buf->lost, 0));
-
-       if (!event->hw.state) {
-               int ret;
-
-               buf = perf_aux_output_begin(&pt->handle, event);
-               if (!buf) {
-                       event->hw.state = PERF_HES_STOPPED;
-                       return;
-               }
-
-               pt_buffer_reset_offsets(buf, pt->handle.head);
-               /* snapshot counters don't use PMI, so it's safe */
-               ret = pt_buffer_reset_markers(buf, &pt->handle);
-               if (ret) {
-                       perf_aux_output_end(&pt->handle, 0, true);
-                       return;
-               }
-
-               pt_config_buffer(buf->cur->table, buf->cur_idx,
-                                buf->output_off);
-               pt_config(event);
-       }
-}
-
-/*
- * PMU callbacks
- */
-
-static void pt_event_start(struct perf_event *event, int mode)
-{
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-       struct pt_buffer *buf = perf_get_aux(&pt->handle);
-
-       if (!buf || pt_buffer_is_full(buf, pt)) {
-               event->hw.state = PERF_HES_STOPPED;
-               return;
-       }
-
-       ACCESS_ONCE(pt->handle_nmi) = 1;
-       event->hw.state = 0;
-
-       pt_config_buffer(buf->cur->table, buf->cur_idx,
-                        buf->output_off);
-       pt_config(event);
-}
-
-static void pt_event_stop(struct perf_event *event, int mode)
-{
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-
-       /*
-        * Protect against the PMI racing with disabling wrmsr,
-        * see comment in intel_pt_interrupt().
-        */
-       ACCESS_ONCE(pt->handle_nmi) = 0;
-       pt_config_start(false);
-
-       if (event->hw.state == PERF_HES_STOPPED)
-               return;
-
-       event->hw.state = PERF_HES_STOPPED;
-
-       if (mode & PERF_EF_UPDATE) {
-               struct pt_buffer *buf = perf_get_aux(&pt->handle);
-
-               if (!buf)
-                       return;
-
-               if (WARN_ON_ONCE(pt->handle.event != event))
-                       return;
-
-               pt_read_offset(buf);
-
-               pt_handle_status(pt);
-
-               pt_update_head(pt);
-       }
-}
-
-static void pt_event_del(struct perf_event *event, int mode)
-{
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-       struct pt_buffer *buf;
-
-       pt_event_stop(event, PERF_EF_UPDATE);
-
-       buf = perf_get_aux(&pt->handle);
-
-       if (buf) {
-               if (buf->snapshot)
-                       pt->handle.head =
-                               local_xchg(&buf->data_size,
-                                          buf->nr_pages << PAGE_SHIFT);
-               perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
-                                   local_xchg(&buf->lost, 0));
-       }
-}
-
-static int pt_event_add(struct perf_event *event, int mode)
-{
-       struct pt_buffer *buf;
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-       struct hw_perf_event *hwc = &event->hw;
-       int ret = -EBUSY;
-
-       if (pt->handle.event)
-               goto fail;
-
-       buf = perf_aux_output_begin(&pt->handle, event);
-       ret = -EINVAL;
-       if (!buf)
-               goto fail_stop;
-
-       pt_buffer_reset_offsets(buf, pt->handle.head);
-       if (!buf->snapshot) {
-               ret = pt_buffer_reset_markers(buf, &pt->handle);
-               if (ret)
-                       goto fail_end_stop;
-       }
-
-       if (mode & PERF_EF_START) {
-               pt_event_start(event, 0);
-               ret = -EBUSY;
-               if (hwc->state == PERF_HES_STOPPED)
-                       goto fail_end_stop;
-       } else {
-               hwc->state = PERF_HES_STOPPED;
-       }
-
-       return 0;
-
-fail_end_stop:
-       perf_aux_output_end(&pt->handle, 0, true);
-fail_stop:
-       hwc->state = PERF_HES_STOPPED;
-fail:
-       return ret;
-}
-
-static void pt_event_read(struct perf_event *event)
-{
-}
-
-static void pt_event_destroy(struct perf_event *event)
-{
-       x86_del_exclusive(x86_lbr_exclusive_pt);
-}
-
-static int pt_event_init(struct perf_event *event)
-{
-       if (event->attr.type != pt_pmu.pmu.type)
-               return -ENOENT;
-
-       if (!pt_event_valid(event))
-               return -EINVAL;
-
-       if (x86_add_exclusive(x86_lbr_exclusive_pt))
-               return -EBUSY;
-
-       event->destroy = pt_event_destroy;
-
-       return 0;
-}
-
-void cpu_emergency_stop_pt(void)
-{
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-
-       if (pt->handle.event)
-               pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
-}
-
-static __init int pt_init(void)
-{
-       int ret, cpu, prior_warn = 0;
-
-       BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
-
-       if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
-               return -ENODEV;
-
-       get_online_cpus();
-       for_each_online_cpu(cpu) {
-               u64 ctl;
-
-               ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
-               if (!ret && (ctl & RTIT_CTL_TRACEEN))
-                       prior_warn++;
-       }
-       put_online_cpus();
-
-       if (prior_warn) {
-               x86_add_exclusive(x86_lbr_exclusive_pt);
-               pr_warn("PT is enabled at boot time, doing nothing\n");
-
-               return -EBUSY;
-       }
-
-       ret = pt_pmu_hw_init();
-       if (ret)
-               return ret;
-
-       if (!pt_cap_get(PT_CAP_topa_output)) {
-               pr_warn("ToPA output is not supported on this CPU\n");
-               return -ENODEV;
-       }
-
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
-               pt_pmu.pmu.capabilities =
-                       PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
-
-       pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
-       pt_pmu.pmu.attr_groups  = pt_attr_groups;
-       pt_pmu.pmu.task_ctx_nr  = perf_sw_context;
-       pt_pmu.pmu.event_init   = pt_event_init;
-       pt_pmu.pmu.add          = pt_event_add;
-       pt_pmu.pmu.del          = pt_event_del;
-       pt_pmu.pmu.start        = pt_event_start;
-       pt_pmu.pmu.stop         = pt_event_stop;
-       pt_pmu.pmu.read         = pt_event_read;
-       pt_pmu.pmu.setup_aux    = pt_buffer_setup_aux;
-       pt_pmu.pmu.free_aux     = pt_buffer_free_aux;
-       ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
-
-       return ret;
-}
-arch_initcall(pt_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c

deleted file mode 100644 (file)

index 24a351a..0000000
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ /dev/null
@@ -1,783 +0,0 @@
-/*
- * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
- * Copyright (C) 2013 Google, Inc., Stephane Eranian
- *
- * Intel RAPL interface is specified in the IA-32 Manual Vol3b
- * section 14.7.1 (September 2013)
- *
- * RAPL provides more controls than just reporting energy consumption
- * however here we only expose the 3 energy consumption free running
- * counters (pp0, pkg, dram).
- *
- * Each of those counters increments in a power unit defined by the
- * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
- * but it can vary.
- *
- * Counter to rapl events mappings:
- *
- *  pp0 counter: consumption of all physical cores (power plane 0)
- *       event: rapl_energy_cores
- *    perf code: 0x1
- *
- *  pkg counter: consumption of the whole processor package
- *       event: rapl_energy_pkg
- *    perf code: 0x2
- *
- * dram counter: consumption of the dram domain (servers only)
- *       event: rapl_energy_dram
- *    perf code: 0x3
- *
- * dram counter: consumption of the builtin-gpu domain (client only)
- *       event: rapl_energy_gpu
- *    perf code: 0x4
- *
- * We manage those counters as free running (read-only). They may be
- * use simultaneously by other tools, such as turbostat.
- *
- * The events only support system-wide mode counting. There is no
- * sampling support because it does not make sense and is not
- * supported by the RAPL hardware.
- *
- * Because we want to avoid floating-point operations in the kernel,
- * the events are all reported in fixed point arithmetic (32.32).
- * Tools must adjust the counts to convert them to Watts using
- * the duration of the measurement. Tools may use a function such as
- * ldexp(raw_count, -32);
- */
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/perf_event.h>
-#include <asm/cpu_device_id.h>
-#include "perf_event.h"
-
-/*
- * RAPL energy status counters
- */
-#define RAPL_IDX_PP0_NRG_STAT  0       /* all cores */
-#define INTEL_RAPL_PP0         0x1     /* pseudo-encoding */
-#define RAPL_IDX_PKG_NRG_STAT  1       /* entire package */
-#define INTEL_RAPL_PKG         0x2     /* pseudo-encoding */
-#define RAPL_IDX_RAM_NRG_STAT  2       /* DRAM */
-#define INTEL_RAPL_RAM         0x3     /* pseudo-encoding */
-#define RAPL_IDX_PP1_NRG_STAT  3       /* gpu */
-#define INTEL_RAPL_PP1         0x4     /* pseudo-encoding */
-
-#define NR_RAPL_DOMAINS         0x4
-static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
-       "pp0-core",
-       "package",
-       "dram",
-       "pp1-gpu",
-};
-
-/* Clients have PP0, PKG */
-#define RAPL_IDX_CLN   (1<<RAPL_IDX_PP0_NRG_STAT|\
-                        1<<RAPL_IDX_PKG_NRG_STAT|\
-                        1<<RAPL_IDX_PP1_NRG_STAT)
-
-/* Servers have PP0, PKG, RAM */
-#define RAPL_IDX_SRV   (1<<RAPL_IDX_PP0_NRG_STAT|\
-                        1<<RAPL_IDX_PKG_NRG_STAT|\
-                        1<<RAPL_IDX_RAM_NRG_STAT)
-
-/* Servers have PP0, PKG, RAM, PP1 */
-#define RAPL_IDX_HSW   (1<<RAPL_IDX_PP0_NRG_STAT|\
-                        1<<RAPL_IDX_PKG_NRG_STAT|\
-                        1<<RAPL_IDX_RAM_NRG_STAT|\
-                        1<<RAPL_IDX_PP1_NRG_STAT)
-
-/* Knights Landing has PKG, RAM */
-#define RAPL_IDX_KNL   (1<<RAPL_IDX_PKG_NRG_STAT|\
-                        1<<RAPL_IDX_RAM_NRG_STAT)
-
-/*
- * event code: LSB 8 bits, passed in attr->config
- * any other bit is reserved
- */
-#define RAPL_EVENT_MASK        0xFFULL
-
-#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)          \
-static ssize_t __rapl_##_var##_show(struct kobject *kobj,      \
-                               struct kobj_attribute *attr,    \
-                               char *page)                     \
-{                                                              \
-       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
-       return sprintf(page, _format "\n");                     \
-}                                                              \
-static struct kobj_attribute format_attr_##_var =              \
-       __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
-
-#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
-
-#define RAPL_EVENT_ATTR_STR(_name, v, str)                                     \
-static struct perf_pmu_events_attr event_attr_##v = {                          \
-       .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL),     \
-       .id             = 0,                                                    \
-       .event_str      = str,                                                  \
-};
-
-struct rapl_pmu {
-       spinlock_t       lock;
-       int              n_active; /* number of active events */
-       struct list_head active_list;
-       struct pmu       *pmu; /* pointer to rapl_pmu_class */
-       ktime_t          timer_interval; /* in ktime_t unit */
-       struct hrtimer   hrtimer;
-};
-
-static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;  /* 1/2^hw_unit Joule */
-static struct pmu rapl_pmu_class;
-static cpumask_t rapl_cpu_mask;
-static int rapl_cntr_mask;
-
-static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
-static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
-
-static struct x86_pmu_quirk *rapl_quirks;
-static inline u64 rapl_read_counter(struct perf_event *event)
-{
-       u64 raw;
-       rdmsrl(event->hw.event_base, raw);
-       return raw;
-}
-
-#define rapl_add_quirk(func_)                                          \
-do {                                                                   \
-       static struct x86_pmu_quirk __quirk __initdata = {              \
-               .func = func_,                                          \
-       };                                                              \
-       __quirk.next = rapl_quirks;                                     \
-       rapl_quirks = &__quirk;                                         \
-} while (0)
-
-static inline u64 rapl_scale(u64 v, int cfg)
-{
-       if (cfg > NR_RAPL_DOMAINS) {
-               pr_warn("invalid domain %d, failed to scale data\n", cfg);
-               return v;
-       }
-       /*
-        * scale delta to smallest unit (1/2^32)
-        * users must then scale back: count * 1/(1e9*2^32) to get Joules
-        * or use ldexp(count, -32).
-        * Watts = Joules/Time delta
-        */
-       return v << (32 - rapl_hw_unit[cfg - 1]);
-}
-
-static u64 rapl_event_update(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 prev_raw_count, new_raw_count;
-       s64 delta, sdelta;
-       int shift = RAPL_CNTR_WIDTH;
-
-again:
-       prev_raw_count = local64_read(&hwc->prev_count);
-       rdmsrl(event->hw.event_base, new_raw_count);
-
-       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                           new_raw_count) != prev_raw_count) {
-               cpu_relax();
-               goto again;
-       }
-
-       /*
-        * Now we have the new raw value and have updated the prev
-        * timestamp already. We can now calculate the elapsed delta
-        * (event-)time and add that to the generic event.
-        *
-        * Careful, not all hw sign-extends above the physical width
-        * of the count.
-        */
-       delta = (new_raw_count << shift) - (prev_raw_count << shift);
-       delta >>= shift;
-
-       sdelta = rapl_scale(delta, event->hw.config);
-
-       local64_add(sdelta, &event->count);
-
-       return new_raw_count;
-}
-
-static void rapl_start_hrtimer(struct rapl_pmu *pmu)
-{
-       hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
-                    HRTIMER_MODE_REL_PINNED);
-}
-
-static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
-{
-       hrtimer_cancel(&pmu->hrtimer);
-}
-
-static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
-{
-       struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
-       struct perf_event *event;
-       unsigned long flags;
-
-       if (!pmu->n_active)
-               return HRTIMER_NORESTART;
-
-       spin_lock_irqsave(&pmu->lock, flags);
-
-       list_for_each_entry(event, &pmu->active_list, active_entry) {
-               rapl_event_update(event);
-       }
-
-       spin_unlock_irqrestore(&pmu->lock, flags);
-
-       hrtimer_forward_now(hrtimer, pmu->timer_interval);
-
-       return HRTIMER_RESTART;
-}
-
-static void rapl_hrtimer_init(struct rapl_pmu *pmu)
-{
-       struct hrtimer *hr = &pmu->hrtimer;
-
-       hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       hr->function = rapl_hrtimer_handle;
-}
-
-static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
-                                  struct perf_event *event)
-{
-       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
-               return;
-
-       event->hw.state = 0;
-
-       list_add_tail(&event->active_entry, &pmu->active_list);
-
-       local64_set(&event->hw.prev_count, rapl_read_counter(event));
-
-       pmu->n_active++;
-       if (pmu->n_active == 1)
-               rapl_start_hrtimer(pmu);
-}
-
-static void rapl_pmu_event_start(struct perf_event *event, int mode)
-{
-       struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
-       unsigned long flags;
-
-       spin_lock_irqsave(&pmu->lock, flags);
-       __rapl_pmu_event_start(pmu, event);
-       spin_unlock_irqrestore(&pmu->lock, flags);
-}
-
-static void rapl_pmu_event_stop(struct perf_event *event, int mode)
-{
-       struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
-       struct hw_perf_event *hwc = &event->hw;
-       unsigned long flags;
-
-       spin_lock_irqsave(&pmu->lock, flags);
-
-       /* mark event as deactivated and stopped */
-       if (!(hwc->state & PERF_HES_STOPPED)) {
-               WARN_ON_ONCE(pmu->n_active <= 0);
-               pmu->n_active--;
-               if (pmu->n_active == 0)
-                       rapl_stop_hrtimer(pmu);
-
-               list_del(&event->active_entry);
-
-               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-               hwc->state |= PERF_HES_STOPPED;
-       }
-
-       /* check if update of sw counter is necessary */
-       if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-               /*
-                * Drain the remaining delta count out of a event
-                * that we are disabling:
-                */
-               rapl_event_update(event);
-               hwc->state |= PERF_HES_UPTODATE;
-       }
-
-       spin_unlock_irqrestore(&pmu->lock, flags);
-}
-
-static int rapl_pmu_event_add(struct perf_event *event, int mode)
-{
-       struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
-       struct hw_perf_event *hwc = &event->hw;
-       unsigned long flags;
-
-       spin_lock_irqsave(&pmu->lock, flags);
-
-       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
-       if (mode & PERF_EF_START)
-               __rapl_pmu_event_start(pmu, event);
-
-       spin_unlock_irqrestore(&pmu->lock, flags);
-
-       return 0;
-}
-
-static void rapl_pmu_event_del(struct perf_event *event, int flags)
-{
-       rapl_pmu_event_stop(event, PERF_EF_UPDATE);
-}
-
-static int rapl_pmu_event_init(struct perf_event *event)
-{
-       u64 cfg = event->attr.config & RAPL_EVENT_MASK;
-       int bit, msr, ret = 0;
-
-       /* only look at RAPL events */
-       if (event->attr.type != rapl_pmu_class.type)
-               return -ENOENT;
-
-       /* check only supported bits are set */
-       if (event->attr.config & ~RAPL_EVENT_MASK)
-               return -EINVAL;
-
-       /*
-        * check event is known (determines counter)
-        */
-       switch (cfg) {
-       case INTEL_RAPL_PP0:
-               bit = RAPL_IDX_PP0_NRG_STAT;
-               msr = MSR_PP0_ENERGY_STATUS;
-               break;
-       case INTEL_RAPL_PKG:
-               bit = RAPL_IDX_PKG_NRG_STAT;
-               msr = MSR_PKG_ENERGY_STATUS;
-               break;
-       case INTEL_RAPL_RAM:
-               bit = RAPL_IDX_RAM_NRG_STAT;
-               msr = MSR_DRAM_ENERGY_STATUS;
-               break;
-       case INTEL_RAPL_PP1:
-               bit = RAPL_IDX_PP1_NRG_STAT;
-               msr = MSR_PP1_ENERGY_STATUS;
-               break;
-       default:
-               return -EINVAL;
-       }
-       /* check event supported */
-       if (!(rapl_cntr_mask & (1 << bit)))
-               return -EINVAL;
-
-       /* unsupported modes and filters */
-       if (event->attr.exclude_user   ||
-           event->attr.exclude_kernel ||
-           event->attr.exclude_hv     ||
-           event->attr.exclude_idle   ||
-           event->attr.exclude_host   ||
-           event->attr.exclude_guest  ||
-           event->attr.sample_period) /* no sampling */
-               return -EINVAL;
-
-       /* must be done before validate_group */
-       event->hw.event_base = msr;
-       event->hw.config = cfg;
-       event->hw.idx = bit;
-
-       return ret;
-}
-
-static void rapl_pmu_event_read(struct perf_event *event)
-{
-       rapl_event_update(event);
-}
-
-static ssize_t rapl_get_attr_cpumask(struct device *dev,
-                               struct device_attribute *attr, char *buf)
-{
-       return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
-}
-
-static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
-
-static struct attribute *rapl_pmu_attrs[] = {
-       &dev_attr_cpumask.attr,
-       NULL,
-};
-
-static struct attribute_group rapl_pmu_attr_group = {
-       .attrs = rapl_pmu_attrs,
-};
-
-RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
-RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
-RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
-RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
-
-RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
-RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
-RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
-RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
-
-/*
- * we compute in 0.23 nJ increments regardless of MSR
- */
-RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
-RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
-RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
-RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
-
-static struct attribute *rapl_events_srv_attr[] = {
-       EVENT_PTR(rapl_cores),
-       EVENT_PTR(rapl_pkg),
-       EVENT_PTR(rapl_ram),
-
-       EVENT_PTR(rapl_cores_unit),
-       EVENT_PTR(rapl_pkg_unit),
-       EVENT_PTR(rapl_ram_unit),
-
-       EVENT_PTR(rapl_cores_scale),
-       EVENT_PTR(rapl_pkg_scale),
-       EVENT_PTR(rapl_ram_scale),
-       NULL,
-};
-
-static struct attribute *rapl_events_cln_attr[] = {
-       EVENT_PTR(rapl_cores),
-       EVENT_PTR(rapl_pkg),
-       EVENT_PTR(rapl_gpu),
-
-       EVENT_PTR(rapl_cores_unit),
-       EVENT_PTR(rapl_pkg_unit),
-       EVENT_PTR(rapl_gpu_unit),
-
-       EVENT_PTR(rapl_cores_scale),
-       EVENT_PTR(rapl_pkg_scale),
-       EVENT_PTR(rapl_gpu_scale),
-       NULL,
-};
-
-static struct attribute *rapl_events_hsw_attr[] = {
-       EVENT_PTR(rapl_cores),
-       EVENT_PTR(rapl_pkg),
-       EVENT_PTR(rapl_gpu),
-       EVENT_PTR(rapl_ram),
-
-       EVENT_PTR(rapl_cores_unit),
-       EVENT_PTR(rapl_pkg_unit),
-       EVENT_PTR(rapl_gpu_unit),
-       EVENT_PTR(rapl_ram_unit),
-
-       EVENT_PTR(rapl_cores_scale),
-       EVENT_PTR(rapl_pkg_scale),
-       EVENT_PTR(rapl_gpu_scale),
-       EVENT_PTR(rapl_ram_scale),
-       NULL,
-};
-
-static struct attribute *rapl_events_knl_attr[] = {
-       EVENT_PTR(rapl_pkg),
-       EVENT_PTR(rapl_ram),
-
-       EVENT_PTR(rapl_pkg_unit),
-       EVENT_PTR(rapl_ram_unit),
-
-       EVENT_PTR(rapl_pkg_scale),
-       EVENT_PTR(rapl_ram_scale),
-       NULL,
-};
-
-static struct attribute_group rapl_pmu_events_group = {
-       .name = "events",
-       .attrs = NULL, /* patched at runtime */
-};
-
-DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
-static struct attribute *rapl_formats_attr[] = {
-       &format_attr_event.attr,
-       NULL,
-};
-
-static struct attribute_group rapl_pmu_format_group = {
-       .name = "format",
-       .attrs = rapl_formats_attr,
-};
-
-const struct attribute_group *rapl_attr_groups[] = {
-       &rapl_pmu_attr_group,
-       &rapl_pmu_format_group,
-       &rapl_pmu_events_group,
-       NULL,
-};
-
-static struct pmu rapl_pmu_class = {
-       .attr_groups    = rapl_attr_groups,
-       .task_ctx_nr    = perf_invalid_context, /* system-wide only */
-       .event_init     = rapl_pmu_event_init,
-       .add            = rapl_pmu_event_add, /* must have */
-       .del            = rapl_pmu_event_del, /* must have */
-       .start          = rapl_pmu_event_start,
-       .stop           = rapl_pmu_event_stop,
-       .read           = rapl_pmu_event_read,
-};
-
-static void rapl_cpu_exit(int cpu)
-{
-       struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
-       int i, phys_id = topology_physical_package_id(cpu);
-       int target = -1;
-
-       /* find a new cpu on same package */
-       for_each_online_cpu(i) {
-               if (i == cpu)
-                       continue;
-               if (phys_id == topology_physical_package_id(i)) {
-                       target = i;
-                       break;
-               }
-       }
-       /*
-        * clear cpu from cpumask
-        * if was set in cpumask and still some cpu on package,
-        * then move to new cpu
-        */
-       if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
-               cpumask_set_cpu(target, &rapl_cpu_mask);
-
-       WARN_ON(cpumask_empty(&rapl_cpu_mask));
-       /*
-        * migrate events and context to new cpu
-        */
-       if (target >= 0)
-               perf_pmu_migrate_context(pmu->pmu, cpu, target);
-
-       /* cancel overflow polling timer for CPU */
-       rapl_stop_hrtimer(pmu);
-}
-
-static void rapl_cpu_init(int cpu)
-{
-       int i, phys_id = topology_physical_package_id(cpu);
-
-       /* check if phys_is is already covered */
-       for_each_cpu(i, &rapl_cpu_mask) {
-               if (phys_id == topology_physical_package_id(i))
-                       return;
-       }
-       /* was not found, so add it */
-       cpumask_set_cpu(cpu, &rapl_cpu_mask);
-}
-
-static __init void rapl_hsw_server_quirk(void)
-{
-       /*
-        * DRAM domain on HSW server has fixed energy unit which can be
-        * different than the unit from power unit MSR.
-        * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
-        * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
-        */
-       rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
-}
-
-static int rapl_cpu_prepare(int cpu)
-{
-       struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
-       int phys_id = topology_physical_package_id(cpu);
-       u64 ms;
-
-       if (pmu)
-               return 0;
-
-       if (phys_id < 0)
-               return -1;
-
-       pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
-       if (!pmu)
-               return -1;
-       spin_lock_init(&pmu->lock);
-
-       INIT_LIST_HEAD(&pmu->active_list);
-
-       pmu->pmu = &rapl_pmu_class;
-
-       /*
-        * use reference of 200W for scaling the timeout
-        * to avoid missing counter overflows.
-        * 200W = 200 Joules/sec
-        * divide interval by 2 to avoid lockstep (2 * 100)
-        * if hw unit is 32, then we use 2 ms 1/200/2
-        */
-       if (rapl_hw_unit[0] < 32)
-               ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1));
-       else
-               ms = 2;
-
-       pmu->timer_interval = ms_to_ktime(ms);
-
-       rapl_hrtimer_init(pmu);
-
-       /* set RAPL pmu for this cpu for now */
-       per_cpu(rapl_pmu, cpu) = pmu;
-       per_cpu(rapl_pmu_to_free, cpu) = NULL;
-
-       return 0;
-}
-
-static void rapl_cpu_kfree(int cpu)
-{
-       struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);
-
-       kfree(pmu);
-
-       per_cpu(rapl_pmu_to_free, cpu) = NULL;
-}
-
-static int rapl_cpu_dying(int cpu)
-{
-       struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
-
-       if (!pmu)
-               return 0;
-
-       per_cpu(rapl_pmu, cpu) = NULL;
-
-       per_cpu(rapl_pmu_to_free, cpu) = pmu;
-
-       return 0;
-}
-
-static int rapl_cpu_notifier(struct notifier_block *self,
-                            unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-               rapl_cpu_prepare(cpu);
-               break;
-       case CPU_STARTING:
-               rapl_cpu_init(cpu);
-               break;
-       case CPU_UP_CANCELED:
-       case CPU_DYING:
-               rapl_cpu_dying(cpu);
-               break;
-       case CPU_ONLINE:
-       case CPU_DEAD:
-               rapl_cpu_kfree(cpu);
-               break;
-       case CPU_DOWN_PREPARE:
-               rapl_cpu_exit(cpu);
-               break;
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-static int rapl_check_hw_unit(void)
-{
-       u64 msr_rapl_power_unit_bits;
-       int i;
-
-       /* protect rdmsrl() to handle virtualization */
-       if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
-               return -1;
-       for (i = 0; i < NR_RAPL_DOMAINS; i++)
-               rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
-
-       return 0;
-}
-
-static const struct x86_cpu_id rapl_cpu_match[] = {
-       [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
-       [1] = {},
-};
-
-static int __init rapl_pmu_init(void)
-{
-       struct rapl_pmu *pmu;
-       int cpu, ret;
-       struct x86_pmu_quirk *quirk;
-       int i;
-
-       /*
-        * check for Intel processor family 6
-        */
-       if (!x86_match_cpu(rapl_cpu_match))
-               return 0;
-
-       /* check supported CPU */
-       switch (boot_cpu_data.x86_model) {
-       case 42: /* Sandy Bridge */
-       case 58: /* Ivy Bridge */
-               rapl_cntr_mask = RAPL_IDX_CLN;
-               rapl_pmu_events_group.attrs = rapl_events_cln_attr;
-               break;
-       case 63: /* Haswell-Server */
-               rapl_add_quirk(rapl_hsw_server_quirk);
-               rapl_cntr_mask = RAPL_IDX_SRV;
-               rapl_pmu_events_group.attrs = rapl_events_srv_attr;
-               break;
-       case 60: /* Haswell */
-       case 69: /* Haswell-Celeron */
-       case 61: /* Broadwell */
-               rapl_cntr_mask = RAPL_IDX_HSW;
-               rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
-               break;
-       case 45: /* Sandy Bridge-EP */
-       case 62: /* IvyTown */
-               rapl_cntr_mask = RAPL_IDX_SRV;
-               rapl_pmu_events_group.attrs = rapl_events_srv_attr;
-               break;
-       case 87: /* Knights Landing */
-               rapl_add_quirk(rapl_hsw_server_quirk);
-               rapl_cntr_mask = RAPL_IDX_KNL;
-               rapl_pmu_events_group.attrs = rapl_events_knl_attr;
-
-       default:
-               /* unsupported */
-               return 0;
-       }
-       ret = rapl_check_hw_unit();
-       if (ret)
-               return ret;
-
-       /* run cpu model quirks */
-       for (quirk = rapl_quirks; quirk; quirk = quirk->next)
-               quirk->func();
-       cpu_notifier_register_begin();
-
-       for_each_online_cpu(cpu) {
-               ret = rapl_cpu_prepare(cpu);
-               if (ret)
-                       goto out;
-               rapl_cpu_init(cpu);
-       }
-
-       __perf_cpu_notifier(rapl_cpu_notifier);
-
-       ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
-       if (WARN_ON(ret)) {
-               pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
-               cpu_notifier_register_done();
-               return -1;
-       }
-
-       pmu = __this_cpu_read(rapl_pmu);
-
-       pr_info("RAPL PMU detected,"
-               " API unit is 2^-32 Joules,"
-               " %d fixed counters"
-               " %llu ms ovfl timer\n",
-               hweight32(rapl_cntr_mask),
-               ktime_to_ms(pmu->timer_interval));
-       for (i = 0; i < NR_RAPL_DOMAINS; i++) {
-               if (rapl_cntr_mask & (1 << i)) {
-                       pr_info("hw unit of domain %s 2^-%d Joules\n",
-                               rapl_domain_names[i], rapl_hw_unit[i]);
-               }
-       }
-out:
-       cpu_notifier_register_done();
-
-       return 0;
-}
-device_initcall(rapl_pmu_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c

deleted file mode 100644 (file)

index 3bf41d4..0000000
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ /dev/null
@@ -1,1401 +0,0 @@
-#include "perf_event_intel_uncore.h"
-
-static struct intel_uncore_type *empty_uncore[] = { NULL, };
-struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
-struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
-
-static bool pcidrv_registered;
-struct pci_driver *uncore_pci_driver;
-/* pci bus to socket mapping */
-DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
-struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
-struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
-
-static DEFINE_RAW_SPINLOCK(uncore_box_lock);
-/* mask of cpus that collect uncore events */
-static cpumask_t uncore_cpu_mask;
-
-/* constraint for the fixed counter */
-static struct event_constraint uncore_constraint_fixed =
-       EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
-struct event_constraint uncore_constraint_empty =
-       EVENT_CONSTRAINT(0, 0, 0);
-
-int uncore_pcibus_to_physid(struct pci_bus *bus)
-{
-       struct pci2phy_map *map;
-       int phys_id = -1;
-
-       raw_spin_lock(&pci2phy_map_lock);
-       list_for_each_entry(map, &pci2phy_map_head, list) {
-               if (map->segment == pci_domain_nr(bus)) {
-                       phys_id = map->pbus_to_physid[bus->number];
-                       break;
-               }
-       }
-       raw_spin_unlock(&pci2phy_map_lock);
-
-       return phys_id;
-}
-
-struct pci2phy_map *__find_pci2phy_map(int segment)
-{
-       struct pci2phy_map *map, *alloc = NULL;
-       int i;
-
-       lockdep_assert_held(&pci2phy_map_lock);
-
-lookup:
-       list_for_each_entry(map, &pci2phy_map_head, list) {
-               if (map->segment == segment)
-                       goto end;
-       }
-
-       if (!alloc) {
-               raw_spin_unlock(&pci2phy_map_lock);
-               alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
-               raw_spin_lock(&pci2phy_map_lock);
-
-               if (!alloc)
-                       return NULL;
-
-               goto lookup;
-       }
-
-       map = alloc;
-       alloc = NULL;
-       map->segment = segment;
-       for (i = 0; i < 256; i++)
-               map->pbus_to_physid[i] = -1;
-       list_add_tail(&map->list, &pci2phy_map_head);
-
-end:
-       kfree(alloc);
-       return map;
-}
-
-ssize_t uncore_event_show(struct kobject *kobj,
-                         struct kobj_attribute *attr, char *buf)
-{
-       struct uncore_event_desc *event =
-               container_of(attr, struct uncore_event_desc, attr);
-       return sprintf(buf, "%s", event->config);
-}
-
-struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
-{
-       return container_of(event->pmu, struct intel_uncore_pmu, pmu);
-}
-
-struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
-{
-       struct intel_uncore_box *box;
-
-       box = *per_cpu_ptr(pmu->box, cpu);
-       if (box)
-               return box;
-
-       raw_spin_lock(&uncore_box_lock);
-       /* Recheck in lock to handle races. */
-       if (*per_cpu_ptr(pmu->box, cpu))
-               goto out;
-       list_for_each_entry(box, &pmu->box_list, list) {
-               if (box->phys_id == topology_physical_package_id(cpu)) {
-                       atomic_inc(&box->refcnt);
-                       *per_cpu_ptr(pmu->box, cpu) = box;
-                       break;
-               }
-       }
-out:
-       raw_spin_unlock(&uncore_box_lock);
-
-       return *per_cpu_ptr(pmu->box, cpu);
-}
-
-struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
-{
-       /*
-        * perf core schedules event on the basis of cpu, uncore events are
-        * collected by one of the cpus inside a physical package.
-        */
-       return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
-}
-
-u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-       u64 count;
-
-       rdmsrl(event->hw.event_base, count);
-
-       return count;
-}
-
-/*
- * generic get constraint function for shared match/mask registers.
- */
-struct event_constraint *
-uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct intel_uncore_extra_reg *er;
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-       unsigned long flags;
-       bool ok = false;
-
-       /*
-        * reg->alloc can be set due to existing state, so for fake box we
-        * need to ignore this, otherwise we might fail to allocate proper
-        * fake state for this extra reg constraint.
-        */
-       if (reg1->idx == EXTRA_REG_NONE ||
-           (!uncore_box_is_fake(box) && reg1->alloc))
-               return NULL;
-
-       er = &box->shared_regs[reg1->idx];
-       raw_spin_lock_irqsave(&er->lock, flags);
-       if (!atomic_read(&er->ref) ||
-           (er->config1 == reg1->config && er->config2 == reg2->config)) {
-               atomic_inc(&er->ref);
-               er->config1 = reg1->config;
-               er->config2 = reg2->config;
-               ok = true;
-       }
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-
-       if (ok) {
-               if (!uncore_box_is_fake(box))
-                       reg1->alloc = 1;
-               return NULL;
-       }
-
-       return &uncore_constraint_empty;
-}
-
-void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct intel_uncore_extra_reg *er;
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-
-       /*
-        * Only put constraint if extra reg was actually allocated. Also
-        * takes care of event which do not use an extra shared reg.
-        *
-        * Also, if this is a fake box we shouldn't touch any event state
-        * (reg->alloc) and we don't care about leaving inconsistent box
-        * state either since it will be thrown out.
-        */
-       if (uncore_box_is_fake(box) || !reg1->alloc)
-               return;
-
-       er = &box->shared_regs[reg1->idx];
-       atomic_dec(&er->ref);
-       reg1->alloc = 0;
-}
-
-u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
-{
-       struct intel_uncore_extra_reg *er;
-       unsigned long flags;
-       u64 config;
-
-       er = &box->shared_regs[idx];
-
-       raw_spin_lock_irqsave(&er->lock, flags);
-       config = er->config;
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-
-       return config;
-}
-
-static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       hwc->idx = idx;
-       hwc->last_tag = ++box->tags[idx];
-
-       if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
-               hwc->event_base = uncore_fixed_ctr(box);
-               hwc->config_base = uncore_fixed_ctl(box);
-               return;
-       }
-
-       hwc->config_base = uncore_event_ctl(box, hwc->idx);
-       hwc->event_base  = uncore_perf_ctr(box, hwc->idx);
-}
-
-void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
-{
-       u64 prev_count, new_count, delta;
-       int shift;
-
-       if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
-               shift = 64 - uncore_fixed_ctr_bits(box);
-       else
-               shift = 64 - uncore_perf_ctr_bits(box);
-
-       /* the hrtimer might modify the previous event value */
-again:
-       prev_count = local64_read(&event->hw.prev_count);
-       new_count = uncore_read_counter(box, event);
-       if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
-               goto again;
-
-       delta = (new_count << shift) - (prev_count << shift);
-       delta >>= shift;
-
-       local64_add(delta, &event->count);
-}
-
-/*
- * The overflow interrupt is unavailable for SandyBridge-EP, is broken
- * for SandyBridge. So we use hrtimer to periodically poll the counter
- * to avoid overflow.
- */
-static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
-{
-       struct intel_uncore_box *box;
-       struct perf_event *event;
-       unsigned long flags;
-       int bit;
-
-       box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
-       if (!box->n_active || box->cpu != smp_processor_id())
-               return HRTIMER_NORESTART;
-       /*
-        * disable local interrupt to prevent uncore_pmu_event_start/stop
-        * to interrupt the update process
-        */
-       local_irq_save(flags);
-
-       /*
-        * handle boxes with an active event list as opposed to active
-        * counters
-        */
-       list_for_each_entry(event, &box->active_list, active_entry) {
-               uncore_perf_event_update(box, event);
-       }
-
-       for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
-               uncore_perf_event_update(box, box->events[bit]);
-
-       local_irq_restore(flags);
-
-       hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));
-       return HRTIMER_RESTART;
-}
-
-void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
-{
-       hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration),
-                     HRTIMER_MODE_REL_PINNED);
-}
-
-void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
-{
-       hrtimer_cancel(&box->hrtimer);
-}
-
-static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
-{
-       hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       box->hrtimer.function = uncore_pmu_hrtimer;
-}
-
-static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node)
-{
-       struct intel_uncore_box *box;
-       int i, size;
-
-       size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg);
-
-       box = kzalloc_node(size, GFP_KERNEL, node);
-       if (!box)
-               return NULL;
-
-       for (i = 0; i < type->num_shared_regs; i++)
-               raw_spin_lock_init(&box->shared_regs[i].lock);
-
-       uncore_pmu_init_hrtimer(box);
-       atomic_set(&box->refcnt, 1);
-       box->cpu = -1;
-       box->phys_id = -1;
-
-       /* set default hrtimer timeout */
-       box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL;
-
-       INIT_LIST_HEAD(&box->active_list);
-
-       return box;
-}
-
-/*
- * Using uncore_pmu_event_init pmu event_init callback
- * as a detection point for uncore events.
- */
-static int uncore_pmu_event_init(struct perf_event *event);
-
-static bool is_uncore_event(struct perf_event *event)
-{
-       return event->pmu->event_init == uncore_pmu_event_init;
-}
-
-static int
-uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp)
-{
-       struct perf_event *event;
-       int n, max_count;
-
-       max_count = box->pmu->type->num_counters;
-       if (box->pmu->type->fixed_ctl)
-               max_count++;
-
-       if (box->n_events >= max_count)
-               return -EINVAL;
-
-       n = box->n_events;
-
-       if (is_uncore_event(leader)) {
-               box->event_list[n] = leader;
-               n++;
-       }
-
-       if (!dogrp)
-               return n;
-
-       list_for_each_entry(event, &leader->sibling_list, group_entry) {
-               if (!is_uncore_event(event) ||
-                   event->state <= PERF_EVENT_STATE_OFF)
-                       continue;
-
-               if (n >= max_count)
-                       return -EINVAL;
-
-               box->event_list[n] = event;
-               n++;
-       }
-       return n;
-}
-
-static struct event_constraint *
-uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct intel_uncore_type *type = box->pmu->type;
-       struct event_constraint *c;
-
-       if (type->ops->get_constraint) {
-               c = type->ops->get_constraint(box, event);
-               if (c)
-                       return c;
-       }
-
-       if (event->attr.config == UNCORE_FIXED_EVENT)
-               return &uncore_constraint_fixed;
-
-       if (type->constraints) {
-               for_each_event_constraint(c, type->constraints) {
-                       if ((event->hw.config & c->cmask) == c->code)
-                               return c;
-               }
-       }
-
-       return &type->unconstrainted;
-}
-
-static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       if (box->pmu->type->ops->put_constraint)
-               box->pmu->type->ops->put_constraint(box, event);
-}
-
-static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
-{
-       unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
-       struct event_constraint *c;
-       int i, wmin, wmax, ret = 0;
-       struct hw_perf_event *hwc;
-
-       bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
-
-       for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
-               c = uncore_get_event_constraint(box, box->event_list[i]);
-               box->event_constraint[i] = c;
-               wmin = min(wmin, c->weight);
-               wmax = max(wmax, c->weight);
-       }
-
-       /* fastpath, try to reuse previous register */
-       for (i = 0; i < n; i++) {
-               hwc = &box->event_list[i]->hw;
-               c = box->event_constraint[i];
-
-               /* never assigned */
-               if (hwc->idx == -1)
-                       break;
-
-               /* constraint still honored */
-               if (!test_bit(hwc->idx, c->idxmsk))
-                       break;
-
-               /* not already used */
-               if (test_bit(hwc->idx, used_mask))
-                       break;
-
-               __set_bit(hwc->idx, used_mask);
-               if (assign)
-                       assign[i] = hwc->idx;
-       }
-       /* slow path */
-       if (i != n)
-               ret = perf_assign_events(box->event_constraint, n,
-                                        wmin, wmax, n, assign);
-
-       if (!assign || ret) {
-               for (i = 0; i < n; i++)
-                       uncore_put_event_constraint(box, box->event_list[i]);
-       }
-       return ret ? -EINVAL : 0;
-}
-
-static void uncore_pmu_event_start(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       int idx = event->hw.idx;
-
-       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
-               return;
-
-       if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
-               return;
-
-       event->hw.state = 0;
-       box->events[idx] = event;
-       box->n_active++;
-       __set_bit(idx, box->active_mask);
-
-       local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
-       uncore_enable_event(box, event);
-
-       if (box->n_active == 1) {
-               uncore_enable_box(box);
-               uncore_pmu_start_hrtimer(box);
-       }
-}
-
-static void uncore_pmu_event_stop(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
-               uncore_disable_event(box, event);
-               box->n_active--;
-               box->events[hwc->idx] = NULL;
-               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-               hwc->state |= PERF_HES_STOPPED;
-
-               if (box->n_active == 0) {
-                       uncore_disable_box(box);
-                       uncore_pmu_cancel_hrtimer(box);
-               }
-       }
-
-       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-               /*
-                * Drain the remaining delta count out of a event
-                * that we are disabling:
-                */
-               uncore_perf_event_update(box, event);
-               hwc->state |= PERF_HES_UPTODATE;
-       }
-}
-
-static int uncore_pmu_event_add(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       struct hw_perf_event *hwc = &event->hw;
-       int assign[UNCORE_PMC_IDX_MAX];
-       int i, n, ret;
-
-       if (!box)
-               return -ENODEV;
-
-       ret = n = uncore_collect_events(box, event, false);
-       if (ret < 0)
-               return ret;
-
-       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-       if (!(flags & PERF_EF_START))
-               hwc->state |= PERF_HES_ARCH;
-
-       ret = uncore_assign_events(box, assign, n);
-       if (ret)
-               return ret;
-
-       /* save events moving to new counters */
-       for (i = 0; i < box->n_events; i++) {
-               event = box->event_list[i];
-               hwc = &event->hw;
-
-               if (hwc->idx == assign[i] &&
-                       hwc->last_tag == box->tags[assign[i]])
-                       continue;
-               /*
-                * Ensure we don't accidentally enable a stopped
-                * counter simply because we rescheduled.
-                */
-               if (hwc->state & PERF_HES_STOPPED)
-                       hwc->state |= PERF_HES_ARCH;
-
-               uncore_pmu_event_stop(event, PERF_EF_UPDATE);
-       }
-
-       /* reprogram moved events into new counters */
-       for (i = 0; i < n; i++) {
-               event = box->event_list[i];
-               hwc = &event->hw;
-
-               if (hwc->idx != assign[i] ||
-                       hwc->last_tag != box->tags[assign[i]])
-                       uncore_assign_hw_event(box, event, assign[i]);
-               else if (i < box->n_events)
-                       continue;
-
-               if (hwc->state & PERF_HES_ARCH)
-                       continue;
-
-               uncore_pmu_event_start(event, 0);
-       }
-       box->n_events = n;
-
-       return 0;
-}
-
-static void uncore_pmu_event_del(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       int i;
-
-       uncore_pmu_event_stop(event, PERF_EF_UPDATE);
-
-       for (i = 0; i < box->n_events; i++) {
-               if (event == box->event_list[i]) {
-                       uncore_put_event_constraint(box, event);
-
-                       while (++i < box->n_events)
-                               box->event_list[i - 1] = box->event_list[i];
-
-                       --box->n_events;
-                       break;
-               }
-       }
-
-       event->hw.idx = -1;
-       event->hw.last_tag = ~0ULL;
-}
-
-void uncore_pmu_event_read(struct perf_event *event)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       uncore_perf_event_update(box, event);
-}
-
-/*
- * validation ensures the group can be loaded onto the
- * PMU if it was the only group available.
- */
-static int uncore_validate_group(struct intel_uncore_pmu *pmu,
-                               struct perf_event *event)
-{
-       struct perf_event *leader = event->group_leader;
-       struct intel_uncore_box *fake_box;
-       int ret = -EINVAL, n;
-
-       fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE);
-       if (!fake_box)
-               return -ENOMEM;
-
-       fake_box->pmu = pmu;
-       /*
-        * the event is not yet connected with its
-        * siblings therefore we must first collect
-        * existing siblings, then add the new event
-        * before we can simulate the scheduling
-        */
-       n = uncore_collect_events(fake_box, leader, true);
-       if (n < 0)
-               goto out;
-
-       fake_box->n_events = n;
-       n = uncore_collect_events(fake_box, event, false);
-       if (n < 0)
-               goto out;
-
-       fake_box->n_events = n;
-
-       ret = uncore_assign_events(fake_box, NULL, n);
-out:
-       kfree(fake_box);
-       return ret;
-}
-
-static int uncore_pmu_event_init(struct perf_event *event)
-{
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       struct hw_perf_event *hwc = &event->hw;
-       int ret;
-
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       pmu = uncore_event_to_pmu(event);
-       /* no device found for this pmu */
-       if (pmu->func_id < 0)
-               return -ENOENT;
-
-       /*
-        * Uncore PMU does measure at all privilege level all the time.
-        * So it doesn't make sense to specify any exclude bits.
-        */
-       if (event->attr.exclude_user || event->attr.exclude_kernel ||
-                       event->attr.exclude_hv || event->attr.exclude_idle)
-               return -EINVAL;
-
-       /* Sampling not supported yet */
-       if (hwc->sample_period)
-               return -EINVAL;
-
-       /*
-        * Place all uncore events for a particular physical package
-        * onto a single cpu
-        */
-       if (event->cpu < 0)
-               return -EINVAL;
-       box = uncore_pmu_to_box(pmu, event->cpu);
-       if (!box || box->cpu < 0)
-               return -EINVAL;
-       event->cpu = box->cpu;
-
-       event->hw.idx = -1;
-       event->hw.last_tag = ~0ULL;
-       event->hw.extra_reg.idx = EXTRA_REG_NONE;
-       event->hw.branch_reg.idx = EXTRA_REG_NONE;
-
-       if (event->attr.config == UNCORE_FIXED_EVENT) {
-               /* no fixed counter */
-               if (!pmu->type->fixed_ctl)
-                       return -EINVAL;
-               /*
-                * if there is only one fixed counter, only the first pmu
-                * can access the fixed counter
-                */
-               if (pmu->type->single_fixed && pmu->pmu_idx > 0)
-                       return -EINVAL;
-
-               /* fixed counters have event field hardcoded to zero */
-               hwc->config = 0ULL;
-       } else {
-               hwc->config = event->attr.config & pmu->type->event_mask;
-               if (pmu->type->ops->hw_config) {
-                       ret = pmu->type->ops->hw_config(box, event);
-                       if (ret)
-                               return ret;
-               }
-       }
-
-       if (event->group_leader != event)
-               ret = uncore_validate_group(pmu, event);
-       else
-               ret = 0;
-
-       return ret;
-}
-
-static ssize_t uncore_get_attr_cpumask(struct device *dev,
-                               struct device_attribute *attr, char *buf)
-{
-       return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask);
-}
-
-static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
-
-static struct attribute *uncore_pmu_attrs[] = {
-       &dev_attr_cpumask.attr,
-       NULL,
-};
-
-static struct attribute_group uncore_pmu_attr_group = {
-       .attrs = uncore_pmu_attrs,
-};
-
-static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
-{
-       int ret;
-
-       if (!pmu->type->pmu) {
-               pmu->pmu = (struct pmu) {
-                       .attr_groups    = pmu->type->attr_groups,
-                       .task_ctx_nr    = perf_invalid_context,
-                       .event_init     = uncore_pmu_event_init,
-                       .add            = uncore_pmu_event_add,
-                       .del            = uncore_pmu_event_del,
-                       .start          = uncore_pmu_event_start,
-                       .stop           = uncore_pmu_event_stop,
-                       .read           = uncore_pmu_event_read,
-               };
-       } else {
-               pmu->pmu = *pmu->type->pmu;
-               pmu->pmu.attr_groups = pmu->type->attr_groups;
-       }
-
-       if (pmu->type->num_boxes == 1) {
-               if (strlen(pmu->type->name) > 0)
-                       sprintf(pmu->name, "uncore_%s", pmu->type->name);
-               else
-                       sprintf(pmu->name, "uncore");
-       } else {
-               sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
-                       pmu->pmu_idx);
-       }
-
-       ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
-       return ret;
-}
-
-static void __init uncore_type_exit(struct intel_uncore_type *type)
-{
-       int i;
-
-       for (i = 0; i < type->num_boxes; i++)
-               free_percpu(type->pmus[i].box);
-       kfree(type->pmus);
-       type->pmus = NULL;
-       kfree(type->events_group);
-       type->events_group = NULL;
-}
-
-static void __init uncore_types_exit(struct intel_uncore_type **types)
-{
-       int i;
-       for (i = 0; types[i]; i++)
-               uncore_type_exit(types[i]);
-}
-
-static int __init uncore_type_init(struct intel_uncore_type *type)
-{
-       struct intel_uncore_pmu *pmus;
-       struct attribute_group *attr_group;
-       struct attribute **attrs;
-       int i, j;
-
-       pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
-       if (!pmus)
-               return -ENOMEM;
-
-       type->pmus = pmus;
-
-       type->unconstrainted = (struct event_constraint)
-               __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
-                               0, type->num_counters, 0, 0);
-
-       for (i = 0; i < type->num_boxes; i++) {
-               pmus[i].func_id = -1;
-               pmus[i].pmu_idx = i;
-               pmus[i].type = type;
-               INIT_LIST_HEAD(&pmus[i].box_list);
-               pmus[i].box = alloc_percpu(struct intel_uncore_box *);
-               if (!pmus[i].box)
-                       goto fail;
-       }
-
-       if (type->event_descs) {
-               i = 0;
-               while (type->event_descs[i].attr.attr.name)
-                       i++;
-
-               attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
-                                       sizeof(*attr_group), GFP_KERNEL);
-               if (!attr_group)
-                       goto fail;
-
-               attrs = (struct attribute **)(attr_group + 1);
-               attr_group->name = "events";
-               attr_group->attrs = attrs;
-
-               for (j = 0; j < i; j++)
-                       attrs[j] = &type->event_descs[j].attr.attr;
-
-               type->events_group = attr_group;
-       }
-
-       type->pmu_group = &uncore_pmu_attr_group;
-       return 0;
-fail:
-       uncore_type_exit(type);
-       return -ENOMEM;
-}
-
-static int __init uncore_types_init(struct intel_uncore_type **types)
-{
-       int i, ret;
-
-       for (i = 0; types[i]; i++) {
-               ret = uncore_type_init(types[i]);
-               if (ret)
-                       goto fail;
-       }
-       return 0;
-fail:
-       while (--i >= 0)
-               uncore_type_exit(types[i]);
-       return ret;
-}
-
-/*
- * add a pci uncore device
- */
-static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
-{
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       struct intel_uncore_type *type;
-       int phys_id;
-       bool first_box = false;
-
-       phys_id = uncore_pcibus_to_physid(pdev->bus);
-       if (phys_id < 0)
-               return -ENODEV;
-
-       if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
-               int idx = UNCORE_PCI_DEV_IDX(id->driver_data);
-               uncore_extra_pci_dev[phys_id][idx] = pdev;
-               pci_set_drvdata(pdev, NULL);
-               return 0;
-       }
-
-       type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
-       box = uncore_alloc_box(type, NUMA_NO_NODE);
-       if (!box)
-               return -ENOMEM;
-
-       /*
-        * for performance monitoring unit with multiple boxes,
-        * each box has a different function id.
-        */
-       pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
-       /* Knights Landing uses a common PCI device ID for multiple instances of
-        * an uncore PMU device type. There is only one entry per device type in
-        * the knl_uncore_pci_ids table inspite of multiple devices present for
-        * some device types. Hence PCI device idx would be 0 for all devices.
-        * So increment pmu pointer to point to an unused array element.
-        */
-       if (boot_cpu_data.x86_model == 87)
-               while (pmu->func_id >= 0)
-                       pmu++;
-       if (pmu->func_id < 0)
-               pmu->func_id = pdev->devfn;
-       else
-               WARN_ON_ONCE(pmu->func_id != pdev->devfn);
-
-       box->phys_id = phys_id;
-       box->pci_dev = pdev;
-       box->pmu = pmu;
-       uncore_box_init(box);
-       pci_set_drvdata(pdev, box);
-
-       raw_spin_lock(&uncore_box_lock);
-       if (list_empty(&pmu->box_list))
-               first_box = true;
-       list_add_tail(&box->list, &pmu->box_list);
-       raw_spin_unlock(&uncore_box_lock);
-
-       if (first_box)
-               uncore_pmu_register(pmu);
-       return 0;
-}
-
-static void uncore_pci_remove(struct pci_dev *pdev)
-{
-       struct intel_uncore_box *box = pci_get_drvdata(pdev);
-       struct intel_uncore_pmu *pmu;
-       int i, cpu, phys_id;
-       bool last_box = false;
-
-       phys_id = uncore_pcibus_to_physid(pdev->bus);
-       box = pci_get_drvdata(pdev);
-       if (!box) {
-               for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
-                       if (uncore_extra_pci_dev[phys_id][i] == pdev) {
-                               uncore_extra_pci_dev[phys_id][i] = NULL;
-                               break;
-                       }
-               }
-               WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX);
-               return;
-       }
-
-       pmu = box->pmu;
-       if (WARN_ON_ONCE(phys_id != box->phys_id))
-               return;
-
-       pci_set_drvdata(pdev, NULL);
-
-       raw_spin_lock(&uncore_box_lock);
-       list_del(&box->list);
-       if (list_empty(&pmu->box_list))
-               last_box = true;
-       raw_spin_unlock(&uncore_box_lock);
-
-       for_each_possible_cpu(cpu) {
-               if (*per_cpu_ptr(pmu->box, cpu) == box) {
-                       *per_cpu_ptr(pmu->box, cpu) = NULL;
-                       atomic_dec(&box->refcnt);
-               }
-       }
-
-       WARN_ON_ONCE(atomic_read(&box->refcnt) != 1);
-       kfree(box);
-
-       if (last_box)
-               perf_pmu_unregister(&pmu->pmu);
-}
-
-static int __init uncore_pci_init(void)
-{
-       int ret;
-
-       switch (boot_cpu_data.x86_model) {
-       case 45: /* Sandy Bridge-EP */
-               ret = snbep_uncore_pci_init();
-               break;
-       case 62: /* Ivy Bridge-EP */
-               ret = ivbep_uncore_pci_init();
-               break;
-       case 63: /* Haswell-EP */
-               ret = hswep_uncore_pci_init();
-               break;
-       case 79: /* BDX-EP */
-       case 86: /* BDX-DE */
-               ret = bdx_uncore_pci_init();
-               break;
-       case 42: /* Sandy Bridge */
-               ret = snb_uncore_pci_init();
-               break;
-       case 58: /* Ivy Bridge */
-               ret = ivb_uncore_pci_init();
-               break;
-       case 60: /* Haswell */
-       case 69: /* Haswell Celeron */
-               ret = hsw_uncore_pci_init();
-               break;
-       case 61: /* Broadwell */
-               ret = bdw_uncore_pci_init();
-               break;
-       case 87: /* Knights Landing */
-               ret = knl_uncore_pci_init();
-               break;
-       case 94: /* SkyLake */
-               ret = skl_uncore_pci_init();
-               break;
-       default:
-               return 0;
-       }
-
-       if (ret)
-               return ret;
-
-       ret = uncore_types_init(uncore_pci_uncores);
-       if (ret)
-               return ret;
-
-       uncore_pci_driver->probe = uncore_pci_probe;
-       uncore_pci_driver->remove = uncore_pci_remove;
-
-       ret = pci_register_driver(uncore_pci_driver);
-       if (ret == 0)
-               pcidrv_registered = true;
-       else
-               uncore_types_exit(uncore_pci_uncores);
-
-       return ret;
-}
-
-static void __init uncore_pci_exit(void)
-{
-       if (pcidrv_registered) {
-               pcidrv_registered = false;
-               pci_unregister_driver(uncore_pci_driver);
-               uncore_types_exit(uncore_pci_uncores);
-       }
-}
-
-/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */
-static LIST_HEAD(boxes_to_free);
-
-static void uncore_kfree_boxes(void)
-{
-       struct intel_uncore_box *box;
-
-       while (!list_empty(&boxes_to_free)) {
-               box = list_entry(boxes_to_free.next,
-                                struct intel_uncore_box, list);
-               list_del(&box->list);
-               kfree(box);
-       }
-}
-
-static void uncore_cpu_dying(int cpu)
-{
-       struct intel_uncore_type *type;
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       int i, j;
-
-       for (i = 0; uncore_msr_uncores[i]; i++) {
-               type = uncore_msr_uncores[i];
-               for (j = 0; j < type->num_boxes; j++) {
-                       pmu = &type->pmus[j];
-                       box = *per_cpu_ptr(pmu->box, cpu);
-                       *per_cpu_ptr(pmu->box, cpu) = NULL;
-                       if (box && atomic_dec_and_test(&box->refcnt))
-                               list_add(&box->list, &boxes_to_free);
-               }
-       }
-}
-
-static int uncore_cpu_starting(int cpu)
-{
-       struct intel_uncore_type *type;
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box, *exist;
-       int i, j, k, phys_id;
-
-       phys_id = topology_physical_package_id(cpu);
-
-       for (i = 0; uncore_msr_uncores[i]; i++) {
-               type = uncore_msr_uncores[i];
-               for (j = 0; j < type->num_boxes; j++) {
-                       pmu = &type->pmus[j];
-                       box = *per_cpu_ptr(pmu->box, cpu);
-                       /* called by uncore_cpu_init? */
-                       if (box && box->phys_id >= 0) {
-                               uncore_box_init(box);
-                               continue;
-                       }
-
-                       for_each_online_cpu(k) {
-                               exist = *per_cpu_ptr(pmu->box, k);
-                               if (exist && exist->phys_id == phys_id) {
-                                       atomic_inc(&exist->refcnt);
-                                       *per_cpu_ptr(pmu->box, cpu) = exist;
-                                       if (box) {
-                                               list_add(&box->list,
-                                                        &boxes_to_free);
-                                               box = NULL;
-                                       }
-                                       break;
-                               }
-                       }
-
-                       if (box) {
-                               box->phys_id = phys_id;
-                               uncore_box_init(box);
-                       }
-               }
-       }
-       return 0;
-}
-
-static int uncore_cpu_prepare(int cpu, int phys_id)
-{
-       struct intel_uncore_type *type;
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       int i, j;
-
-       for (i = 0; uncore_msr_uncores[i]; i++) {
-               type = uncore_msr_uncores[i];
-               for (j = 0; j < type->num_boxes; j++) {
-                       pmu = &type->pmus[j];
-                       if (pmu->func_id < 0)
-                               pmu->func_id = j;
-
-                       box = uncore_alloc_box(type, cpu_to_node(cpu));
-                       if (!box)
-                               return -ENOMEM;
-
-                       box->pmu = pmu;
-                       box->phys_id = phys_id;
-                       *per_cpu_ptr(pmu->box, cpu) = box;
-               }
-       }
-       return 0;
-}
-
-static void
-uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu)
-{
-       struct intel_uncore_type *type;
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       int i, j;
-
-       for (i = 0; uncores[i]; i++) {
-               type = uncores[i];
-               for (j = 0; j < type->num_boxes; j++) {
-                       pmu = &type->pmus[j];
-                       if (old_cpu < 0)
-                               box = uncore_pmu_to_box(pmu, new_cpu);
-                       else
-                               box = uncore_pmu_to_box(pmu, old_cpu);
-                       if (!box)
-                               continue;
-
-                       if (old_cpu < 0) {
-                               WARN_ON_ONCE(box->cpu != -1);
-                               box->cpu = new_cpu;
-                               continue;
-                       }
-
-                       WARN_ON_ONCE(box->cpu != old_cpu);
-                       if (new_cpu >= 0) {
-                               uncore_pmu_cancel_hrtimer(box);
-                               perf_pmu_migrate_context(&pmu->pmu,
-                                               old_cpu, new_cpu);
-                               box->cpu = new_cpu;
-                       } else {
-                               box->cpu = -1;
-                       }
-               }
-       }
-}
-
-static void uncore_event_exit_cpu(int cpu)
-{
-       int i, phys_id, target;
-
-       /* if exiting cpu is used for collecting uncore events */
-       if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
-               return;
-
-       /* find a new cpu to collect uncore events */
-       phys_id = topology_physical_package_id(cpu);
-       target = -1;
-       for_each_online_cpu(i) {
-               if (i == cpu)
-                       continue;
-               if (phys_id == topology_physical_package_id(i)) {
-                       target = i;
-                       break;
-               }
-       }
-
-       /* migrate uncore events to the new cpu */
-       if (target >= 0)
-               cpumask_set_cpu(target, &uncore_cpu_mask);
-
-       uncore_change_context(uncore_msr_uncores, cpu, target);
-       uncore_change_context(uncore_pci_uncores, cpu, target);
-}
-
-static void uncore_event_init_cpu(int cpu)
-{
-       int i, phys_id;
-
-       phys_id = topology_physical_package_id(cpu);
-       for_each_cpu(i, &uncore_cpu_mask) {
-               if (phys_id == topology_physical_package_id(i))
-                       return;
-       }
-
-       cpumask_set_cpu(cpu, &uncore_cpu_mask);
-
-       uncore_change_context(uncore_msr_uncores, -1, cpu);
-       uncore_change_context(uncore_pci_uncores, -1, cpu);
-}
-
-static int uncore_cpu_notifier(struct notifier_block *self,
-                              unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-
-       /* allocate/free data structure for uncore box */
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-               uncore_cpu_prepare(cpu, -1);
-               break;
-       case CPU_STARTING:
-               uncore_cpu_starting(cpu);
-               break;
-       case CPU_UP_CANCELED:
-       case CPU_DYING:
-               uncore_cpu_dying(cpu);
-               break;
-       case CPU_ONLINE:
-       case CPU_DEAD:
-               uncore_kfree_boxes();
-               break;
-       default:
-               break;
-       }
-
-       /* select the cpu that collects uncore events */
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_DOWN_FAILED:
-       case CPU_STARTING:
-               uncore_event_init_cpu(cpu);
-               break;
-       case CPU_DOWN_PREPARE:
-               uncore_event_exit_cpu(cpu);
-               break;
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-static struct notifier_block uncore_cpu_nb = {
-       .notifier_call  = uncore_cpu_notifier,
-       /*
-        * to migrate uncore events, our notifier should be executed
-        * before perf core's notifier.
-        */
-       .priority       = CPU_PRI_PERF + 1,
-};
-
-static void __init uncore_cpu_setup(void *dummy)
-{
-       uncore_cpu_starting(smp_processor_id());
-}
-
-static int __init uncore_cpu_init(void)
-{
-       int ret;
-
-       switch (boot_cpu_data.x86_model) {
-       case 26: /* Nehalem */
-       case 30:
-       case 37: /* Westmere */
-       case 44:
-               nhm_uncore_cpu_init();
-               break;
-       case 42: /* Sandy Bridge */
-       case 58: /* Ivy Bridge */
-       case 60: /* Haswell */
-       case 69: /* Haswell */
-       case 70: /* Haswell */
-       case 61: /* Broadwell */
-       case 71: /* Broadwell */
-               snb_uncore_cpu_init();
-               break;
-       case 45: /* Sandy Bridge-EP */
-               snbep_uncore_cpu_init();
-               break;
-       case 46: /* Nehalem-EX */
-       case 47: /* Westmere-EX aka. Xeon E7 */
-               nhmex_uncore_cpu_init();
-               break;
-       case 62: /* Ivy Bridge-EP */
-               ivbep_uncore_cpu_init();
-               break;
-       case 63: /* Haswell-EP */
-               hswep_uncore_cpu_init();
-               break;
-       case 79: /* BDX-EP */
-       case 86: /* BDX-DE */
-               bdx_uncore_cpu_init();
-               break;
-       case 87: /* Knights Landing */
-               knl_uncore_cpu_init();
-               break;
-       default:
-               return 0;
-       }
-
-       ret = uncore_types_init(uncore_msr_uncores);
-       if (ret)
-               return ret;
-
-       return 0;
-}
-
-static int __init uncore_pmus_register(void)
-{
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_type *type;
-       int i, j;
-
-       for (i = 0; uncore_msr_uncores[i]; i++) {
-               type = uncore_msr_uncores[i];
-               for (j = 0; j < type->num_boxes; j++) {
-                       pmu = &type->pmus[j];
-                       uncore_pmu_register(pmu);
-               }
-       }
-
-       return 0;
-}
-
-static void __init uncore_cpumask_init(void)
-{
-       int cpu;
-
-       /*
-        * ony invoke once from msr or pci init code
-        */
-       if (!cpumask_empty(&uncore_cpu_mask))
-               return;
-
-       cpu_notifier_register_begin();
-
-       for_each_online_cpu(cpu) {
-               int i, phys_id = topology_physical_package_id(cpu);
-
-               for_each_cpu(i, &uncore_cpu_mask) {
-                       if (phys_id == topology_physical_package_id(i)) {
-                               phys_id = -1;
-                               break;
-                       }
-               }
-               if (phys_id < 0)
-                       continue;
-
-               uncore_cpu_prepare(cpu, phys_id);
-               uncore_event_init_cpu(cpu);
-       }
-       on_each_cpu(uncore_cpu_setup, NULL, 1);
-
-       __register_cpu_notifier(&uncore_cpu_nb);
-
-       cpu_notifier_register_done();
-}
-
-
-static int __init intel_uncore_init(void)
-{
-       int ret;
-
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
-               return -ENODEV;
-
-       if (cpu_has_hypervisor)
-               return -ENODEV;
-
-       ret = uncore_pci_init();
-       if (ret)
-               goto fail;
-       ret = uncore_cpu_init();
-       if (ret) {
-               uncore_pci_exit();
-               goto fail;
-       }
-       uncore_cpumask_init();
-
-       uncore_pmus_register();
-       return 0;
-fail:
-       return ret;
-}
-device_initcall(intel_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h

deleted file mode 100644 (file)

index a7086b8..0000000
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ /dev/null
@@ -1,357 +0,0 @@
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/pci.h>
-#include <linux/perf_event.h>
-#include "perf_event.h"
-
-#define UNCORE_PMU_NAME_LEN            32
-#define UNCORE_PMU_HRTIMER_INTERVAL    (60LL * NSEC_PER_SEC)
-#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)
-
-#define UNCORE_FIXED_EVENT             0xff
-#define UNCORE_PMC_IDX_MAX_GENERIC     8
-#define UNCORE_PMC_IDX_FIXED           UNCORE_PMC_IDX_MAX_GENERIC
-#define UNCORE_PMC_IDX_MAX             (UNCORE_PMC_IDX_FIXED + 1)
-
-#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx)
-#define UNCORE_PCI_DEV_TYPE(data)      ((data >> 8) & 0xff)
-#define UNCORE_PCI_DEV_IDX(data)       (data & 0xff)
-#define UNCORE_EXTRA_PCI_DEV           0xff
-#define UNCORE_EXTRA_PCI_DEV_MAX       3
-
-/* support up to 8 sockets */
-#define UNCORE_SOCKET_MAX              8
-
-#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
-
-struct intel_uncore_ops;
-struct intel_uncore_pmu;
-struct intel_uncore_box;
-struct uncore_event_desc;
-
-struct intel_uncore_type {
-       const char *name;
-       int num_counters;
-       int num_boxes;
-       int perf_ctr_bits;
-       int fixed_ctr_bits;
-       unsigned perf_ctr;
-       unsigned event_ctl;
-       unsigned event_mask;
-       unsigned fixed_ctr;
-       unsigned fixed_ctl;
-       unsigned box_ctl;
-       unsigned msr_offset;
-       unsigned num_shared_regs:8;
-       unsigned single_fixed:1;
-       unsigned pair_ctr_ctl:1;
-       unsigned *msr_offsets;
-       struct event_constraint unconstrainted;
-       struct event_constraint *constraints;
-       struct intel_uncore_pmu *pmus;
-       struct intel_uncore_ops *ops;
-       struct uncore_event_desc *event_descs;
-       const struct attribute_group *attr_groups[4];
-       struct pmu *pmu; /* for custom pmu ops */
-};
-
-#define pmu_group attr_groups[0]
-#define format_group attr_groups[1]
-#define events_group attr_groups[2]
-
-struct intel_uncore_ops {
-       void (*init_box)(struct intel_uncore_box *);
-       void (*disable_box)(struct intel_uncore_box *);
-       void (*enable_box)(struct intel_uncore_box *);
-       void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
-       void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
-       u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
-       int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
-       struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
-                                                  struct perf_event *);
-       void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
-};
-
-struct intel_uncore_pmu {
-       struct pmu pmu;
-       char name[UNCORE_PMU_NAME_LEN];
-       int pmu_idx;
-       int func_id;
-       struct intel_uncore_type *type;
-       struct intel_uncore_box ** __percpu box;
-       struct list_head box_list;
-};
-
-struct intel_uncore_extra_reg {
-       raw_spinlock_t lock;
-       u64 config, config1, config2;
-       atomic_t ref;
-};
-
-struct intel_uncore_box {
-       int phys_id;
-       int n_active;   /* number of active events */
-       int n_events;
-       int cpu;        /* cpu to collect events */
-       unsigned long flags;
-       atomic_t refcnt;
-       struct perf_event *events[UNCORE_PMC_IDX_MAX];
-       struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
-       struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX];
-       unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
-       u64 tags[UNCORE_PMC_IDX_MAX];
-       struct pci_dev *pci_dev;
-       struct intel_uncore_pmu *pmu;
-       u64 hrtimer_duration; /* hrtimer timeout for this box */
-       struct hrtimer hrtimer;
-       struct list_head list;
-       struct list_head active_list;
-       void *io_addr;
-       struct intel_uncore_extra_reg shared_regs[0];
-};
-
-#define UNCORE_BOX_FLAG_INITIATED      0
-
-struct uncore_event_desc {
-       struct kobj_attribute attr;
-       const char *config;
-};
-
-struct pci2phy_map {
-       struct list_head list;
-       int segment;
-       int pbus_to_physid[256];
-};
-
-int uncore_pcibus_to_physid(struct pci_bus *bus);
-struct pci2phy_map *__find_pci2phy_map(int segment);
-
-ssize_t uncore_event_show(struct kobject *kobj,
-                         struct kobj_attribute *attr, char *buf);
-
-#define INTEL_UNCORE_EVENT_DESC(_name, _config)                        \
-{                                                              \
-       .attr   = __ATTR(_name, 0444, uncore_event_show, NULL), \
-       .config = _config,                                      \
-}
-
-#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format)                        \
-static ssize_t __uncore_##_var##_show(struct kobject *kobj,            \
-                               struct kobj_attribute *attr,            \
-                               char *page)                             \
-{                                                                      \
-       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);                     \
-       return sprintf(page, _format "\n");                             \
-}                                                                      \
-static struct kobj_attribute format_attr_##_var =                      \
-       __ATTR(_name, 0444, __uncore_##_var##_show, NULL)
-
-static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
-{
-       return box->pmu->type->box_ctl;
-}
-
-static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
-{
-       return box->pmu->type->fixed_ctl;
-}
-
-static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
-{
-       return box->pmu->type->fixed_ctr;
-}
-
-static inline
-unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
-{
-       return idx * 4 + box->pmu->type->event_ctl;
-}
-
-static inline
-unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
-{
-       return idx * 8 + box->pmu->type->perf_ctr;
-}
-
-static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box)
-{
-       struct intel_uncore_pmu *pmu = box->pmu;
-       return pmu->type->msr_offsets ?
-               pmu->type->msr_offsets[pmu->pmu_idx] :
-               pmu->type->msr_offset * pmu->pmu_idx;
-}
-
-static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
-{
-       if (!box->pmu->type->box_ctl)
-               return 0;
-       return box->pmu->type->box_ctl + uncore_msr_box_offset(box);
-}
-
-static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
-{
-       if (!box->pmu->type->fixed_ctl)
-               return 0;
-       return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box);
-}
-
-static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
-{
-       return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
-}
-
-static inline
-unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
-{
-       return box->pmu->type->event_ctl +
-               (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
-               uncore_msr_box_offset(box);
-}
-
-static inline
-unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
-{
-       return box->pmu->type->perf_ctr +
-               (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
-               uncore_msr_box_offset(box);
-}
-
-static inline
-unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
-{
-       if (box->pci_dev)
-               return uncore_pci_fixed_ctl(box);
-       else
-               return uncore_msr_fixed_ctl(box);
-}
-
-static inline
-unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
-{
-       if (box->pci_dev)
-               return uncore_pci_fixed_ctr(box);
-       else
-               return uncore_msr_fixed_ctr(box);
-}
-
-static inline
-unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
-{
-       if (box->pci_dev)
-               return uncore_pci_event_ctl(box, idx);
-       else
-               return uncore_msr_event_ctl(box, idx);
-}
-
-static inline
-unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
-{
-       if (box->pci_dev)
-               return uncore_pci_perf_ctr(box, idx);
-       else
-               return uncore_msr_perf_ctr(box, idx);
-}
-
-static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
-{
-       return box->pmu->type->perf_ctr_bits;
-}
-
-static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
-{
-       return box->pmu->type->fixed_ctr_bits;
-}
-
-static inline int uncore_num_counters(struct intel_uncore_box *box)
-{
-       return box->pmu->type->num_counters;
-}
-
-static inline void uncore_disable_box(struct intel_uncore_box *box)
-{
-       if (box->pmu->type->ops->disable_box)
-               box->pmu->type->ops->disable_box(box);
-}
-
-static inline void uncore_enable_box(struct intel_uncore_box *box)
-{
-       if (box->pmu->type->ops->enable_box)
-               box->pmu->type->ops->enable_box(box);
-}
-
-static inline void uncore_disable_event(struct intel_uncore_box *box,
-                               struct perf_event *event)
-{
-       box->pmu->type->ops->disable_event(box, event);
-}
-
-static inline void uncore_enable_event(struct intel_uncore_box *box,
-                               struct perf_event *event)
-{
-       box->pmu->type->ops->enable_event(box, event);
-}
-
-static inline u64 uncore_read_counter(struct intel_uncore_box *box,
-                               struct perf_event *event)
-{
-       return box->pmu->type->ops->read_counter(box, event);
-}
-
-static inline void uncore_box_init(struct intel_uncore_box *box)
-{
-       if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
-               if (box->pmu->type->ops->init_box)
-                       box->pmu->type->ops->init_box(box);
-       }
-}
-
-static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
-{
-       return (box->phys_id < 0);
-}
-
-struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event);
-struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu);
-struct intel_uncore_box *uncore_event_to_box(struct perf_event *event);
-u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event);
-void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
-void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
-void uncore_pmu_event_read(struct perf_event *event);
-void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
-struct event_constraint *
-uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event);
-void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event);
-u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
-
-extern struct intel_uncore_type **uncore_msr_uncores;
-extern struct intel_uncore_type **uncore_pci_uncores;
-extern struct pci_driver *uncore_pci_driver;
-extern raw_spinlock_t pci2phy_map_lock;
-extern struct list_head pci2phy_map_head;
-extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
-extern struct event_constraint uncore_constraint_empty;
-
-/* perf_event_intel_uncore_snb.c */
-int snb_uncore_pci_init(void);
-int ivb_uncore_pci_init(void);
-int hsw_uncore_pci_init(void);
-int bdw_uncore_pci_init(void);
-int skl_uncore_pci_init(void);
-void snb_uncore_cpu_init(void);
-void nhm_uncore_cpu_init(void);
-int snb_pci2phy_map_init(int devid);
-
-/* perf_event_intel_uncore_snbep.c */
-int snbep_uncore_pci_init(void);
-void snbep_uncore_cpu_init(void);
-int ivbep_uncore_pci_init(void);
-void ivbep_uncore_cpu_init(void);
-int hswep_uncore_pci_init(void);
-void hswep_uncore_cpu_init(void);
-int bdx_uncore_pci_init(void);
-void bdx_uncore_cpu_init(void);
-int knl_uncore_pci_init(void);
-void knl_uncore_cpu_init(void);
-
-/* perf_event_intel_uncore_nhmex.c */
-void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c

deleted file mode 100644 (file)

index 2749965..0000000
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c
+++ /dev/null
@@ -1,1221 +0,0 @@
-/* Nehalem-EX/Westmere-EX uncore support */
-#include "perf_event_intel_uncore.h"
-
-/* NHM-EX event control */
-#define NHMEX_PMON_CTL_EV_SEL_MASK     0x000000ff
-#define NHMEX_PMON_CTL_UMASK_MASK      0x0000ff00
-#define NHMEX_PMON_CTL_EN_BIT0         (1 << 0)
-#define NHMEX_PMON_CTL_EDGE_DET                (1 << 18)
-#define NHMEX_PMON_CTL_PMI_EN          (1 << 20)
-#define NHMEX_PMON_CTL_EN_BIT22                (1 << 22)
-#define NHMEX_PMON_CTL_INVERT          (1 << 23)
-#define NHMEX_PMON_CTL_TRESH_MASK      0xff000000
-#define NHMEX_PMON_RAW_EVENT_MASK      (NHMEX_PMON_CTL_EV_SEL_MASK | \
-                                        NHMEX_PMON_CTL_UMASK_MASK | \
-                                        NHMEX_PMON_CTL_EDGE_DET | \
-                                        NHMEX_PMON_CTL_INVERT | \
-                                        NHMEX_PMON_CTL_TRESH_MASK)
-
-/* NHM-EX Ubox */
-#define NHMEX_U_MSR_PMON_GLOBAL_CTL            0xc00
-#define NHMEX_U_MSR_PMON_CTR                   0xc11
-#define NHMEX_U_MSR_PMON_EV_SEL                        0xc10
-
-#define NHMEX_U_PMON_GLOBAL_EN                 (1 << 0)
-#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL       0x0000001e
-#define NHMEX_U_PMON_GLOBAL_EN_ALL             (1 << 28)
-#define NHMEX_U_PMON_GLOBAL_RST_ALL            (1 << 29)
-#define NHMEX_U_PMON_GLOBAL_FRZ_ALL            (1 << 31)
-
-#define NHMEX_U_PMON_RAW_EVENT_MASK            \
-               (NHMEX_PMON_CTL_EV_SEL_MASK |   \
-                NHMEX_PMON_CTL_EDGE_DET)
-
-/* NHM-EX Cbox */
-#define NHMEX_C0_MSR_PMON_GLOBAL_CTL           0xd00
-#define NHMEX_C0_MSR_PMON_CTR0                 0xd11
-#define NHMEX_C0_MSR_PMON_EV_SEL0              0xd10
-#define NHMEX_C_MSR_OFFSET                     0x20
-
-/* NHM-EX Bbox */
-#define NHMEX_B0_MSR_PMON_GLOBAL_CTL           0xc20
-#define NHMEX_B0_MSR_PMON_CTR0                 0xc31
-#define NHMEX_B0_MSR_PMON_CTL0                 0xc30
-#define NHMEX_B_MSR_OFFSET                     0x40
-#define NHMEX_B0_MSR_MATCH                     0xe45
-#define NHMEX_B0_MSR_MASK                      0xe46
-#define NHMEX_B1_MSR_MATCH                     0xe4d
-#define NHMEX_B1_MSR_MASK                      0xe4e
-
-#define NHMEX_B_PMON_CTL_EN                    (1 << 0)
-#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT          1
-#define NHMEX_B_PMON_CTL_EV_SEL_MASK           \
-               (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
-#define NHMEX_B_PMON_CTR_SHIFT         6
-#define NHMEX_B_PMON_CTR_MASK          \
-               (0x3 << NHMEX_B_PMON_CTR_SHIFT)
-#define NHMEX_B_PMON_RAW_EVENT_MASK            \
-               (NHMEX_B_PMON_CTL_EV_SEL_MASK | \
-                NHMEX_B_PMON_CTR_MASK)
-
-/* NHM-EX Sbox */
-#define NHMEX_S0_MSR_PMON_GLOBAL_CTL           0xc40
-#define NHMEX_S0_MSR_PMON_CTR0                 0xc51
-#define NHMEX_S0_MSR_PMON_CTL0                 0xc50
-#define NHMEX_S_MSR_OFFSET                     0x80
-#define NHMEX_S0_MSR_MM_CFG                    0xe48
-#define NHMEX_S0_MSR_MATCH                     0xe49
-#define NHMEX_S0_MSR_MASK                      0xe4a
-#define NHMEX_S1_MSR_MM_CFG                    0xe58
-#define NHMEX_S1_MSR_MATCH                     0xe59
-#define NHMEX_S1_MSR_MASK                      0xe5a
-
-#define NHMEX_S_PMON_MM_CFG_EN                 (0x1ULL << 63)
-#define NHMEX_S_EVENT_TO_R_PROG_EV             0
-
-/* NHM-EX Mbox */
-#define NHMEX_M0_MSR_GLOBAL_CTL                        0xca0
-#define NHMEX_M0_MSR_PMU_DSP                   0xca5
-#define NHMEX_M0_MSR_PMU_ISS                   0xca6
-#define NHMEX_M0_MSR_PMU_MAP                   0xca7
-#define NHMEX_M0_MSR_PMU_MSC_THR               0xca8
-#define NHMEX_M0_MSR_PMU_PGT                   0xca9
-#define NHMEX_M0_MSR_PMU_PLD                   0xcaa
-#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC           0xcab
-#define NHMEX_M0_MSR_PMU_CTL0                  0xcb0
-#define NHMEX_M0_MSR_PMU_CNT0                  0xcb1
-#define NHMEX_M_MSR_OFFSET                     0x40
-#define NHMEX_M0_MSR_PMU_MM_CFG                        0xe54
-#define NHMEX_M1_MSR_PMU_MM_CFG                        0xe5c
-
-#define NHMEX_M_PMON_MM_CFG_EN                 (1ULL << 63)
-#define NHMEX_M_PMON_ADDR_MATCH_MASK           0x3ffffffffULL
-#define NHMEX_M_PMON_ADDR_MASK_MASK            0x7ffffffULL
-#define NHMEX_M_PMON_ADDR_MASK_SHIFT           34
-
-#define NHMEX_M_PMON_CTL_EN                    (1 << 0)
-#define NHMEX_M_PMON_CTL_PMI_EN                        (1 << 1)
-#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT      2
-#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK       \
-       (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
-#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT    4
-#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK     \
-       (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
-#define NHMEX_M_PMON_CTL_WRAP_MODE             (1 << 6)
-#define NHMEX_M_PMON_CTL_FLAG_MODE             (1 << 7)
-#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT         9
-#define NHMEX_M_PMON_CTL_INC_SEL_MASK          \
-       (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
-#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT    19
-#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK     \
-       (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
-#define NHMEX_M_PMON_RAW_EVENT_MASK                    \
-               (NHMEX_M_PMON_CTL_COUNT_MODE_MASK |     \
-                NHMEX_M_PMON_CTL_STORAGE_MODE_MASK |   \
-                NHMEX_M_PMON_CTL_WRAP_MODE |           \
-                NHMEX_M_PMON_CTL_FLAG_MODE |           \
-                NHMEX_M_PMON_CTL_INC_SEL_MASK |        \
-                NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
-
-#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK          (((1 << 11) - 1) | (1 << 23))
-#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n)))
-
-#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK          (((1 << 12) - 1) | (1 << 24))
-#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n)))
-
-/*
- * use the 9~13 bits to select event If the 7th bit is not set,
- * otherwise use the 19~21 bits to select event.
- */
-#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
-#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
-                               NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
-                          NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
-                               NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_INC_SEL_EXTAR_REG(c, r) \
-               EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
-                               MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
-#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
-               EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
-                               MBOX_SET_FLAG_SEL_MASK, \
-                               (u64)-1, NHMEX_M_##r)
-
-/* NHM-EX Rbox */
-#define NHMEX_R_MSR_GLOBAL_CTL                 0xe00
-#define NHMEX_R_MSR_PMON_CTL0                  0xe10
-#define NHMEX_R_MSR_PMON_CNT0                  0xe11
-#define NHMEX_R_MSR_OFFSET                     0x20
-
-#define NHMEX_R_MSR_PORTN_QLX_CFG(n)           \
-               ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
-#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n)                (0xe04 + (n))
-#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n)                (0xe24 + (n))
-#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n)                \
-               (((n) < 4 ? 0 : 0x10) + (n) * 4)
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n)   \
-               (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n)    \
-               (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n)     \
-               (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n)   \
-               (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n)    \
-               (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n)     \
-               (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
-
-#define NHMEX_R_PMON_CTL_EN                    (1 << 0)
-#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT          1
-#define NHMEX_R_PMON_CTL_EV_SEL_MASK           \
-               (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
-#define NHMEX_R_PMON_CTL_PMI_EN                        (1 << 6)
-#define NHMEX_R_PMON_RAW_EVENT_MASK            NHMEX_R_PMON_CTL_EV_SEL_MASK
-
-/* NHM-EX Wbox */
-#define NHMEX_W_MSR_GLOBAL_CTL                 0xc80
-#define NHMEX_W_MSR_PMON_CNT0                  0xc90
-#define NHMEX_W_MSR_PMON_EVT_SEL0              0xc91
-#define NHMEX_W_MSR_PMON_FIXED_CTR             0x394
-#define NHMEX_W_MSR_PMON_FIXED_CTL             0x395
-
-#define NHMEX_W_PMON_GLOBAL_FIXED_EN           (1ULL << 31)
-
-#define __BITS_VALUE(x, i, n)  ((typeof(x))(((x) >> ((i) * (n))) & \
-                               ((1ULL << (n)) - 1)))
-
-DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
-DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
-DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
-DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
-DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
-
-static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
-{
-       wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
-}
-
-static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
-       unsigned msr = uncore_msr_box_ctl(box);
-       u64 config;
-
-       if (msr) {
-               rdmsrl(msr, config);
-               config &= ~((1ULL << uncore_num_counters(box)) - 1);
-               /* WBox has a fixed counter */
-               if (uncore_msr_fixed_ctl(box))
-                       config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
-               wrmsrl(msr, config);
-       }
-}
-
-static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
-       unsigned msr = uncore_msr_box_ctl(box);
-       u64 config;
-
-       if (msr) {
-               rdmsrl(msr, config);
-               config |= (1ULL << uncore_num_counters(box)) - 1;
-               /* WBox has a fixed counter */
-               if (uncore_msr_fixed_ctl(box))
-                       config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
-               wrmsrl(msr, config);
-       }
-}
-
-static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       wrmsrl(event->hw.config_base, 0);
-}
-
-static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
-               wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
-       else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
-               wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
-       else
-               wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
-}
-
-#define NHMEX_UNCORE_OPS_COMMON_INIT()                         \
-       .init_box       = nhmex_uncore_msr_init_box,            \
-       .disable_box    = nhmex_uncore_msr_disable_box,         \
-       .enable_box     = nhmex_uncore_msr_enable_box,          \
-       .disable_event  = nhmex_uncore_msr_disable_event,       \
-       .read_counter   = uncore_msr_read_counter
-
-static struct intel_uncore_ops nhmex_uncore_ops = {
-       NHMEX_UNCORE_OPS_COMMON_INIT(),
-       .enable_event   = nhmex_uncore_msr_enable_event,
-};
-
-static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_edge.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_ubox_format_group = {
-       .name           = "format",
-       .attrs          = nhmex_uncore_ubox_formats_attr,
-};
-
-static struct intel_uncore_type nhmex_uncore_ubox = {
-       .name           = "ubox",
-       .num_counters   = 1,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       .event_ctl      = NHMEX_U_MSR_PMON_EV_SEL,
-       .perf_ctr       = NHMEX_U_MSR_PMON_CTR,
-       .event_mask     = NHMEX_U_PMON_RAW_EVENT_MASK,
-       .box_ctl        = NHMEX_U_MSR_PMON_GLOBAL_CTL,
-       .ops            = &nhmex_uncore_ops,
-       .format_group   = &nhmex_uncore_ubox_format_group
-};
-
-static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_cbox_format_group = {
-       .name = "format",
-       .attrs = nhmex_uncore_cbox_formats_attr,
-};
-
-/* msr offset for each instance of cbox */
-static unsigned nhmex_cbox_msr_offsets[] = {
-       0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
-};
-
-static struct intel_uncore_type nhmex_uncore_cbox = {
-       .name                   = "cbox",
-       .num_counters           = 6,
-       .num_boxes              = 10,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_C0_MSR_PMON_EV_SEL0,
-       .perf_ctr               = NHMEX_C0_MSR_PMON_CTR0,
-       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_C0_MSR_PMON_GLOBAL_CTL,
-       .msr_offsets            = nhmex_cbox_msr_offsets,
-       .pair_ctr_ctl           = 1,
-       .ops                    = &nhmex_uncore_ops,
-       .format_group           = &nhmex_uncore_cbox_format_group
-};
-
-static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
-       { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_type nhmex_uncore_wbox = {
-       .name                   = "wbox",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_W_MSR_PMON_CNT0,
-       .perf_ctr               = NHMEX_W_MSR_PMON_EVT_SEL0,
-       .fixed_ctr              = NHMEX_W_MSR_PMON_FIXED_CTR,
-       .fixed_ctl              = NHMEX_W_MSR_PMON_FIXED_CTL,
-       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_W_MSR_GLOBAL_CTL,
-       .pair_ctr_ctl           = 1,
-       .event_descs            = nhmex_uncore_wbox_events,
-       .ops                    = &nhmex_uncore_ops,
-       .format_group           = &nhmex_uncore_cbox_format_group
-};
-
-static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-       int ctr, ev_sel;
-
-       ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
-               NHMEX_B_PMON_CTR_SHIFT;
-       ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
-                 NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
-
-       /* events that do not use the match/mask registers */
-       if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
-           (ctr == 2 && ev_sel != 0x4) || ctr == 3)
-               return 0;
-
-       if (box->pmu->pmu_idx == 0)
-               reg1->reg = NHMEX_B0_MSR_MATCH;
-       else
-               reg1->reg = NHMEX_B1_MSR_MATCH;
-       reg1->idx = 0;
-       reg1->config = event->attr.config1;
-       reg2->config = event->attr.config2;
-       return 0;
-}
-
-static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE) {
-               wrmsrl(reg1->reg, reg1->config);
-               wrmsrl(reg1->reg + 1, reg2->config);
-       }
-       wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
-               (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
-}
-
-/*
- * The Bbox has 4 counters, but each counter monitors different events.
- * Use bits 6-7 in the event config to select counter.
- */
-static struct event_constraint nhmex_uncore_bbox_constraints[] = {
-       EVENT_CONSTRAINT(0 , 1, 0xc0),
-       EVENT_CONSTRAINT(0x40, 2, 0xc0),
-       EVENT_CONSTRAINT(0x80, 4, 0xc0),
-       EVENT_CONSTRAINT(0xc0, 8, 0xc0),
-       EVENT_CONSTRAINT_END,
-};
-
-static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
-       &format_attr_event5.attr,
-       &format_attr_counter.attr,
-       &format_attr_match.attr,
-       &format_attr_mask.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_bbox_format_group = {
-       .name = "format",
-       .attrs = nhmex_uncore_bbox_formats_attr,
-};
-
-static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
-       NHMEX_UNCORE_OPS_COMMON_INIT(),
-       .enable_event           = nhmex_bbox_msr_enable_event,
-       .hw_config              = nhmex_bbox_hw_config,
-       .get_constraint         = uncore_get_constraint,
-       .put_constraint         = uncore_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_bbox = {
-       .name                   = "bbox",
-       .num_counters           = 4,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_B0_MSR_PMON_CTL0,
-       .perf_ctr               = NHMEX_B0_MSR_PMON_CTR0,
-       .event_mask             = NHMEX_B_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_B0_MSR_PMON_GLOBAL_CTL,
-       .msr_offset             = NHMEX_B_MSR_OFFSET,
-       .pair_ctr_ctl           = 1,
-       .num_shared_regs        = 1,
-       .constraints            = nhmex_uncore_bbox_constraints,
-       .ops                    = &nhmex_uncore_bbox_ops,
-       .format_group           = &nhmex_uncore_bbox_format_group
-};
-
-static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-       /* only TO_R_PROG_EV event uses the match/mask register */
-       if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
-           NHMEX_S_EVENT_TO_R_PROG_EV)
-               return 0;
-
-       if (box->pmu->pmu_idx == 0)
-               reg1->reg = NHMEX_S0_MSR_MM_CFG;
-       else
-               reg1->reg = NHMEX_S1_MSR_MM_CFG;
-       reg1->idx = 0;
-       reg1->config = event->attr.config1;
-       reg2->config = event->attr.config2;
-       return 0;
-}
-
-static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE) {
-               wrmsrl(reg1->reg, 0);
-               wrmsrl(reg1->reg + 1, reg1->config);
-               wrmsrl(reg1->reg + 2, reg2->config);
-               wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
-       }
-       wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
-}
-
-static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_match.attr,
-       &format_attr_mask.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_sbox_format_group = {
-       .name                   = "format",
-       .attrs                  = nhmex_uncore_sbox_formats_attr,
-};
-
-static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
-       NHMEX_UNCORE_OPS_COMMON_INIT(),
-       .enable_event           = nhmex_sbox_msr_enable_event,
-       .hw_config              = nhmex_sbox_hw_config,
-       .get_constraint         = uncore_get_constraint,
-       .put_constraint         = uncore_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_sbox = {
-       .name                   = "sbox",
-       .num_counters           = 4,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_S0_MSR_PMON_CTL0,
-       .perf_ctr               = NHMEX_S0_MSR_PMON_CTR0,
-       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_S0_MSR_PMON_GLOBAL_CTL,
-       .msr_offset             = NHMEX_S_MSR_OFFSET,
-       .pair_ctr_ctl           = 1,
-       .num_shared_regs        = 1,
-       .ops                    = &nhmex_uncore_sbox_ops,
-       .format_group           = &nhmex_uncore_sbox_format_group
-};
-
-enum {
-       EXTRA_REG_NHMEX_M_FILTER,
-       EXTRA_REG_NHMEX_M_DSP,
-       EXTRA_REG_NHMEX_M_ISS,
-       EXTRA_REG_NHMEX_M_MAP,
-       EXTRA_REG_NHMEX_M_MSC_THR,
-       EXTRA_REG_NHMEX_M_PGT,
-       EXTRA_REG_NHMEX_M_PLD,
-       EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
-};
-
-static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
-       MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
-       MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
-       MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
-       MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
-       /* event 0xa uses two extra registers */
-       MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
-       MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
-       MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
-       /* events 0xd ~ 0x10 use the same extra register */
-       MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
-       MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
-       MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
-       MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
-       MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
-       MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
-       MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
-       MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
-       MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
-       EVENT_EXTRA_END
-};
-
-/* Nehalem-EX or Westmere-EX ? */
-static bool uncore_nhmex;
-
-static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
-{
-       struct intel_uncore_extra_reg *er;
-       unsigned long flags;
-       bool ret = false;
-       u64 mask;
-
-       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
-               er = &box->shared_regs[idx];
-               raw_spin_lock_irqsave(&er->lock, flags);
-               if (!atomic_read(&er->ref) || er->config == config) {
-                       atomic_inc(&er->ref);
-                       er->config = config;
-                       ret = true;
-               }
-               raw_spin_unlock_irqrestore(&er->lock, flags);
-
-               return ret;
-       }
-       /*
-        * The ZDP_CTL_FVC MSR has 4 fields which are used to control
-        * events 0xd ~ 0x10. Besides these 4 fields, there are additional
-        * fields which are shared.
-        */
-       idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-       if (WARN_ON_ONCE(idx >= 4))
-               return false;
-
-       /* mask of the shared fields */
-       if (uncore_nhmex)
-               mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
-       else
-               mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
-       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
-
-       raw_spin_lock_irqsave(&er->lock, flags);
-       /* add mask of the non-shared field if it's in use */
-       if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
-               if (uncore_nhmex)
-                       mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-               else
-                       mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-       }
-
-       if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
-               atomic_add(1 << (idx * 8), &er->ref);
-               if (uncore_nhmex)
-                       mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
-                               NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-               else
-                       mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
-                               WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-               er->config &= ~mask;
-               er->config |= (config & mask);
-               ret = true;
-       }
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-
-       return ret;
-}
-
-static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
-{
-       struct intel_uncore_extra_reg *er;
-
-       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
-               er = &box->shared_regs[idx];
-               atomic_dec(&er->ref);
-               return;
-       }
-
-       idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
-       atomic_sub(1 << (idx * 8), &er->ref);
-}
-
-static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
-       u64 config = reg1->config;
-
-       /* get the non-shared control bits and shift them */
-       idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-       if (uncore_nhmex)
-               config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-       else
-               config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-       if (new_idx > orig_idx) {
-               idx = new_idx - orig_idx;
-               config <<= 3 * idx;
-       } else {
-               idx = orig_idx - new_idx;
-               config >>= 3 * idx;
-       }
-
-       /* add the shared control bits back */
-       if (uncore_nhmex)
-               config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
-       else
-               config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
-       config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
-       if (modify) {
-               /* adjust the main event selector */
-               if (new_idx > orig_idx)
-                       hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
-               else
-                       hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
-               reg1->config = config;
-               reg1->idx = ~0xff | new_idx;
-       }
-       return config;
-}
-
-static struct event_constraint *
-nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-       int i, idx[2], alloc = 0;
-       u64 config1 = reg1->config;
-
-       idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
-       idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
-again:
-       for (i = 0; i < 2; i++) {
-               if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
-                       idx[i] = 0xff;
-
-               if (idx[i] == 0xff)
-                       continue;
-
-               if (!nhmex_mbox_get_shared_reg(box, idx[i],
-                               __BITS_VALUE(config1, i, 32)))
-                       goto fail;
-               alloc |= (0x1 << i);
-       }
-
-       /* for the match/mask registers */
-       if (reg2->idx != EXTRA_REG_NONE &&
-           (uncore_box_is_fake(box) || !reg2->alloc) &&
-           !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
-               goto fail;
-
-       /*
-        * If it's a fake box -- as per validate_{group,event}() we
-        * shouldn't touch event state and we can avoid doing so
-        * since both will only call get_event_constraints() once
-        * on each event, this avoids the need for reg->alloc.
-        */
-       if (!uncore_box_is_fake(box)) {
-               if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
-                       nhmex_mbox_alter_er(event, idx[0], true);
-               reg1->alloc |= alloc;
-               if (reg2->idx != EXTRA_REG_NONE)
-                       reg2->alloc = 1;
-       }
-       return NULL;
-fail:
-       if (idx[0] != 0xff && !(alloc & 0x1) &&
-           idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
-               /*
-                * events 0xd ~ 0x10 are functional identical, but are
-                * controlled by different fields in the ZDP_CTL_FVC
-                * register. If we failed to take one field, try the
-                * rest 3 choices.
-                */
-               BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
-               idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-               idx[0] = (idx[0] + 1) % 4;
-               idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-               if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
-                       config1 = nhmex_mbox_alter_er(event, idx[0], false);
-                       goto again;
-               }
-       }
-
-       if (alloc & 0x1)
-               nhmex_mbox_put_shared_reg(box, idx[0]);
-       if (alloc & 0x2)
-               nhmex_mbox_put_shared_reg(box, idx[1]);
-       return &uncore_constraint_empty;
-}
-
-static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-
-       if (uncore_box_is_fake(box))
-               return;
-
-       if (reg1->alloc & 0x1)
-               nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
-       if (reg1->alloc & 0x2)
-               nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
-       reg1->alloc = 0;
-
-       if (reg2->alloc) {
-               nhmex_mbox_put_shared_reg(box, reg2->idx);
-               reg2->alloc = 0;
-       }
-}
-
-static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
-{
-       if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
-               return er->idx;
-       return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
-}
-
-static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct intel_uncore_type *type = box->pmu->type;
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-       struct extra_reg *er;
-       unsigned msr;
-       int reg_idx = 0;
-       /*
-        * The mbox events may require 2 extra MSRs at the most. But only
-        * the lower 32 bits in these MSRs are significant, so we can use
-        * config1 to pass two MSRs' config.
-        */
-       for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
-               if (er->event != (event->hw.config & er->config_mask))
-                       continue;
-               if (event->attr.config1 & ~er->valid_mask)
-                       return -EINVAL;
-
-               msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
-               if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
-                       return -EINVAL;
-
-               /* always use the 32~63 bits to pass the PLD config */
-               if (er->idx == EXTRA_REG_NHMEX_M_PLD)
-                       reg_idx = 1;
-               else if (WARN_ON_ONCE(reg_idx > 0))
-                       return -EINVAL;
-
-               reg1->idx &= ~(0xff << (reg_idx * 8));
-               reg1->reg &= ~(0xffff << (reg_idx * 16));
-               reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
-               reg1->reg |= msr << (reg_idx * 16);
-               reg1->config = event->attr.config1;
-               reg_idx++;
-       }
-       /*
-        * The mbox only provides ability to perform address matching
-        * for the PLD events.
-        */
-       if (reg_idx == 2) {
-               reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
-               if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
-                       reg2->config = event->attr.config2;
-               else
-                       reg2->config = ~0ULL;
-               if (box->pmu->pmu_idx == 0)
-                       reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
-               else
-                       reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
-       }
-       return 0;
-}
-
-static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
-{
-       struct intel_uncore_extra_reg *er;
-       unsigned long flags;
-       u64 config;
-
-       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
-               return box->shared_regs[idx].config;
-
-       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
-       raw_spin_lock_irqsave(&er->lock, flags);
-       config = er->config;
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-       return config;
-}
-
-static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-       int idx;
-
-       idx = __BITS_VALUE(reg1->idx, 0, 8);
-       if (idx != 0xff)
-               wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
-                       nhmex_mbox_shared_reg_config(box, idx));
-       idx = __BITS_VALUE(reg1->idx, 1, 8);
-       if (idx != 0xff)
-               wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
-                       nhmex_mbox_shared_reg_config(box, idx));
-
-       if (reg2->idx != EXTRA_REG_NONE) {
-               wrmsrl(reg2->reg, 0);
-               if (reg2->config != ~0ULL) {
-                       wrmsrl(reg2->reg + 1,
-                               reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
-                       wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
-                               (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
-                       wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
-               }
-       }
-
-       wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
-}
-
-DEFINE_UNCORE_FORMAT_ATTR(count_mode,          count_mode,     "config:2-3");
-DEFINE_UNCORE_FORMAT_ATTR(storage_mode,                storage_mode,   "config:4-5");
-DEFINE_UNCORE_FORMAT_ATTR(wrap_mode,           wrap_mode,      "config:6");
-DEFINE_UNCORE_FORMAT_ATTR(flag_mode,           flag_mode,      "config:7");
-DEFINE_UNCORE_FORMAT_ATTR(inc_sel,             inc_sel,        "config:9-13");
-DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel,                set_flag_sel,   "config:19-21");
-DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en,       filter_cfg_en,  "config2:63");
-DEFINE_UNCORE_FORMAT_ATTR(filter_match,                filter_match,   "config2:0-33");
-DEFINE_UNCORE_FORMAT_ATTR(filter_mask,         filter_mask,    "config2:34-61");
-DEFINE_UNCORE_FORMAT_ATTR(dsp,                 dsp,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(thr,                 thr,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(fvc,                 fvc,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(pgt,                 pgt,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(map,                 map,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(iss,                 iss,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(pld,                 pld,            "config1:32-63");
-
-static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
-       &format_attr_count_mode.attr,
-       &format_attr_storage_mode.attr,
-       &format_attr_wrap_mode.attr,
-       &format_attr_flag_mode.attr,
-       &format_attr_inc_sel.attr,
-       &format_attr_set_flag_sel.attr,
-       &format_attr_filter_cfg_en.attr,
-       &format_attr_filter_match.attr,
-       &format_attr_filter_mask.attr,
-       &format_attr_dsp.attr,
-       &format_attr_thr.attr,
-       &format_attr_fvc.attr,
-       &format_attr_pgt.attr,
-       &format_attr_map.attr,
-       &format_attr_iss.attr,
-       &format_attr_pld.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_mbox_format_group = {
-       .name           = "format",
-       .attrs          = nhmex_uncore_mbox_formats_attr,
-};
-
-static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
-       INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
-       INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
-       { /* end: all zeroes */ },
-};
-
-static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
-       INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
-       INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
-       { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
-       NHMEX_UNCORE_OPS_COMMON_INIT(),
-       .enable_event   = nhmex_mbox_msr_enable_event,
-       .hw_config      = nhmex_mbox_hw_config,
-       .get_constraint = nhmex_mbox_get_constraint,
-       .put_constraint = nhmex_mbox_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_mbox = {
-       .name                   = "mbox",
-       .num_counters           = 6,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_M0_MSR_PMU_CTL0,
-       .perf_ctr               = NHMEX_M0_MSR_PMU_CNT0,
-       .event_mask             = NHMEX_M_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_M0_MSR_GLOBAL_CTL,
-       .msr_offset             = NHMEX_M_MSR_OFFSET,
-       .pair_ctr_ctl           = 1,
-       .num_shared_regs        = 8,
-       .event_descs            = nhmex_uncore_mbox_events,
-       .ops                    = &nhmex_uncore_mbox_ops,
-       .format_group           = &nhmex_uncore_mbox_format_group,
-};
-
-static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
-       /* adjust the main event selector and extra register index */
-       if (reg1->idx % 2) {
-               reg1->idx--;
-               hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
-       } else {
-               reg1->idx++;
-               hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
-       }
-
-       /* adjust extra register config */
-       switch (reg1->idx % 6) {
-       case 2:
-               /* shift the 8~15 bits to the 0~7 bits */
-               reg1->config >>= 8;
-               break;
-       case 3:
-               /* shift the 0~7 bits to the 8~15 bits */
-               reg1->config <<= 8;
-               break;
-       }
-}
-
-/*
- * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7.
- * An event set consists of 6 events, the 3rd and 4th events in
- * an event set use the same extra register. So an event set uses
- * 5 extra registers.
- */
-static struct event_constraint *
-nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-       struct intel_uncore_extra_reg *er;
-       unsigned long flags;
-       int idx, er_idx;
-       u64 config1;
-       bool ok = false;
-
-       if (!uncore_box_is_fake(box) && reg1->alloc)
-               return NULL;
-
-       idx = reg1->idx % 6;
-       config1 = reg1->config;
-again:
-       er_idx = idx;
-       /* the 3rd and 4th events use the same extra register */
-       if (er_idx > 2)
-               er_idx--;
-       er_idx += (reg1->idx / 6) * 5;
-
-       er = &box->shared_regs[er_idx];
-       raw_spin_lock_irqsave(&er->lock, flags);
-       if (idx < 2) {
-               if (!atomic_read(&er->ref) || er->config == reg1->config) {
-                       atomic_inc(&er->ref);
-                       er->config = reg1->config;
-                       ok = true;
-               }
-       } else if (idx == 2 || idx == 3) {
-               /*
-                * these two events use different fields in a extra register,
-                * the 0~7 bits and the 8~15 bits respectively.
-                */
-               u64 mask = 0xff << ((idx - 2) * 8);
-               if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
-                               !((er->config ^ config1) & mask)) {
-                       atomic_add(1 << ((idx - 2) * 8), &er->ref);
-                       er->config &= ~mask;
-                       er->config |= config1 & mask;
-                       ok = true;
-               }
-       } else {
-               if (!atomic_read(&er->ref) ||
-                               (er->config == (hwc->config >> 32) &&
-                                er->config1 == reg1->config &&
-                                er->config2 == reg2->config)) {
-                       atomic_inc(&er->ref);
-                       er->config = (hwc->config >> 32);
-                       er->config1 = reg1->config;
-                       er->config2 = reg2->config;
-                       ok = true;
-               }
-       }
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-
-       if (!ok) {
-               /*
-                * The Rbox events are always in pairs. The paired
-                * events are functional identical, but use different
-                * extra registers. If we failed to take an extra
-                * register, try the alternative.
-                */
-               idx ^= 1;
-               if (idx != reg1->idx % 6) {
-                       if (idx == 2)
-                               config1 >>= 8;
-                       else if (idx == 3)
-                               config1 <<= 8;
-                       goto again;
-               }
-       } else {
-               if (!uncore_box_is_fake(box)) {
-                       if (idx != reg1->idx % 6)
-                               nhmex_rbox_alter_er(box, event);
-                       reg1->alloc = 1;
-               }
-               return NULL;
-       }
-       return &uncore_constraint_empty;
-}
-
-static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct intel_uncore_extra_reg *er;
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       int idx, er_idx;
-
-       if (uncore_box_is_fake(box) || !reg1->alloc)
-               return;
-
-       idx = reg1->idx % 6;
-       er_idx = idx;
-       if (er_idx > 2)
-               er_idx--;
-       er_idx += (reg1->idx / 6) * 5;
-
-       er = &box->shared_regs[er_idx];
-       if (idx == 2 || idx == 3)
-               atomic_sub(1 << ((idx - 2) * 8), &er->ref);
-       else
-               atomic_dec(&er->ref);
-
-       reg1->alloc = 0;
-}
-
-static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-       int idx;
-
-       idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
-               NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
-       if (idx >= 0x18)
-               return -EINVAL;
-
-       reg1->idx = idx;
-       reg1->config = event->attr.config1;
-
-       switch (idx % 6) {
-       case 4:
-       case 5:
-               hwc->config |= event->attr.config & (~0ULL << 32);
-               reg2->config = event->attr.config2;
-               break;
-       }
-       return 0;
-}
-
-static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-       int idx, port;
-
-       idx = reg1->idx;
-       port = idx / 6 + box->pmu->pmu_idx * 4;
-
-       switch (idx % 6) {
-       case 0:
-               wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
-               break;
-       case 1:
-               wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
-               break;
-       case 2:
-       case 3:
-               wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
-                       uncore_shared_reg_config(box, 2 + (idx / 6) * 5));
-               break;
-       case 4:
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
-                       hwc->config >> 32);
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
-               break;
-       case 5:
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
-                       hwc->config >> 32);
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
-               break;
-       }
-
-       wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
-               (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
-}
-
-DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
-DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
-DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
-
-static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
-       &format_attr_event5.attr,
-       &format_attr_xbr_mm_cfg.attr,
-       &format_attr_xbr_match.attr,
-       &format_attr_xbr_mask.attr,
-       &format_attr_qlx_cfg.attr,
-       &format_attr_iperf_cfg.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_rbox_format_group = {
-       .name = "format",
-       .attrs = nhmex_uncore_rbox_formats_attr,
-};
-
-static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
-       INTEL_UNCORE_EVENT_DESC(qpi0_flit_send,         "event=0x0,iperf_cfg=0x80000000"),
-       INTEL_UNCORE_EVENT_DESC(qpi1_filt_send,         "event=0x6,iperf_cfg=0x80000000"),
-       INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt,         "event=0x0,iperf_cfg=0x40000000"),
-       INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt,         "event=0x6,iperf_cfg=0x40000000"),
-       INTEL_UNCORE_EVENT_DESC(qpi0_date_response,     "event=0x0,iperf_cfg=0xc4"),
-       INTEL_UNCORE_EVENT_DESC(qpi1_date_response,     "event=0x6,iperf_cfg=0xc4"),
-       { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
-       NHMEX_UNCORE_OPS_COMMON_INIT(),
-       .enable_event           = nhmex_rbox_msr_enable_event,
-       .hw_config              = nhmex_rbox_hw_config,
-       .get_constraint         = nhmex_rbox_get_constraint,
-       .put_constraint         = nhmex_rbox_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_rbox = {
-       .name                   = "rbox",
-       .num_counters           = 8,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_R_MSR_PMON_CTL0,
-       .perf_ctr               = NHMEX_R_MSR_PMON_CNT0,
-       .event_mask             = NHMEX_R_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_R_MSR_GLOBAL_CTL,
-       .msr_offset             = NHMEX_R_MSR_OFFSET,
-       .pair_ctr_ctl           = 1,
-       .num_shared_regs        = 20,
-       .event_descs            = nhmex_uncore_rbox_events,
-       .ops                    = &nhmex_uncore_rbox_ops,
-       .format_group           = &nhmex_uncore_rbox_format_group
-};
-
-static struct intel_uncore_type *nhmex_msr_uncores[] = {
-       &nhmex_uncore_ubox,
-       &nhmex_uncore_cbox,
-       &nhmex_uncore_bbox,
-       &nhmex_uncore_sbox,
-       &nhmex_uncore_mbox,
-       &nhmex_uncore_rbox,
-       &nhmex_uncore_wbox,
-       NULL,
-};
-
-void nhmex_uncore_cpu_init(void)
-{
-       if (boot_cpu_data.x86_model == 46)
-               uncore_nhmex = true;
-       else
-               nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
-       if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-       uncore_msr_uncores = nhmex_msr_uncores;
-}
-/* end of Nehalem-EX uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c

deleted file mode 100644 (file)

index 2bd030d..0000000
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ /dev/null
@@ -1,717 +0,0 @@
-/* Nehalem/SandBridge/Haswell uncore support */
-#include "perf_event_intel_uncore.h"
-
-/* Uncore IMC PCI IDs */
-#define PCI_DEVICE_ID_INTEL_SNB_IMC    0x0100
-#define PCI_DEVICE_ID_INTEL_IVB_IMC    0x0154
-#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
-#define PCI_DEVICE_ID_INTEL_HSW_IMC    0x0c00
-#define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
-#define PCI_DEVICE_ID_INTEL_BDW_IMC    0x1604
-#define PCI_DEVICE_ID_INTEL_SKL_IMC    0x191f
-
-/* SNB event control */
-#define SNB_UNC_CTL_EV_SEL_MASK                        0x000000ff
-#define SNB_UNC_CTL_UMASK_MASK                 0x0000ff00
-#define SNB_UNC_CTL_EDGE_DET                   (1 << 18)
-#define SNB_UNC_CTL_EN                         (1 << 22)
-#define SNB_UNC_CTL_INVERT                     (1 << 23)
-#define SNB_UNC_CTL_CMASK_MASK                 0x1f000000
-#define NHM_UNC_CTL_CMASK_MASK                 0xff000000
-#define NHM_UNC_FIXED_CTR_CTL_EN               (1 << 0)
-
-#define SNB_UNC_RAW_EVENT_MASK                 (SNB_UNC_CTL_EV_SEL_MASK | \
-                                                SNB_UNC_CTL_UMASK_MASK | \
-                                                SNB_UNC_CTL_EDGE_DET | \
-                                                SNB_UNC_CTL_INVERT | \
-                                                SNB_UNC_CTL_CMASK_MASK)
-
-#define NHM_UNC_RAW_EVENT_MASK                 (SNB_UNC_CTL_EV_SEL_MASK | \
-                                                SNB_UNC_CTL_UMASK_MASK | \
-                                                SNB_UNC_CTL_EDGE_DET | \
-                                                SNB_UNC_CTL_INVERT | \
-                                                NHM_UNC_CTL_CMASK_MASK)
-
-/* SNB global control register */
-#define SNB_UNC_PERF_GLOBAL_CTL                 0x391
-#define SNB_UNC_FIXED_CTR_CTRL                  0x394
-#define SNB_UNC_FIXED_CTR                       0x395
-
-/* SNB uncore global control */
-#define SNB_UNC_GLOBAL_CTL_CORE_ALL             ((1 << 4) - 1)
-#define SNB_UNC_GLOBAL_CTL_EN                   (1 << 29)
-
-/* SNB Cbo register */
-#define SNB_UNC_CBO_0_PERFEVTSEL0               0x700
-#define SNB_UNC_CBO_0_PER_CTR0                  0x706
-#define SNB_UNC_CBO_MSR_OFFSET                  0x10
-
-/* SNB ARB register */
-#define SNB_UNC_ARB_PER_CTR0                   0x3b0
-#define SNB_UNC_ARB_PERFEVTSEL0                        0x3b2
-#define SNB_UNC_ARB_MSR_OFFSET                 0x10
-
-/* NHM global control register */
-#define NHM_UNC_PERF_GLOBAL_CTL                 0x391
-#define NHM_UNC_FIXED_CTR                       0x394
-#define NHM_UNC_FIXED_CTR_CTRL                  0x395
-
-/* NHM uncore global control */
-#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL            ((1ULL << 8) - 1)
-#define NHM_UNC_GLOBAL_CTL_EN_FC                (1ULL << 32)
-
-/* NHM uncore register */
-#define NHM_UNC_PERFEVTSEL0                     0x3c0
-#define NHM_UNC_UNCORE_PMC0                     0x3b0
-
-DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
-DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
-DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
-DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
-
-/* Sandy Bridge uncore support */
-static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (hwc->idx < UNCORE_PMC_IDX_FIXED)
-               wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
-       else
-               wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
-}
-
-static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       wrmsrl(event->hw.config_base, 0);
-}
-
-static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
-{
-       if (box->pmu->pmu_idx == 0) {
-               wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
-                       SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
-       }
-}
-
-static struct uncore_event_desc snb_uncore_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
-       { /* end: all zeroes */ },
-};
-
-static struct attribute *snb_uncore_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask5.attr,
-       NULL,
-};
-
-static struct attribute_group snb_uncore_format_group = {
-       .name           = "format",
-       .attrs          = snb_uncore_formats_attr,
-};
-
-static struct intel_uncore_ops snb_uncore_msr_ops = {
-       .init_box       = snb_uncore_msr_init_box,
-       .disable_event  = snb_uncore_msr_disable_event,
-       .enable_event   = snb_uncore_msr_enable_event,
-       .read_counter   = uncore_msr_read_counter,
-};
-
-static struct event_constraint snb_uncore_arb_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type snb_uncore_cbox = {
-       .name           = "cbox",
-       .num_counters   = 2,
-       .num_boxes      = 4,
-       .perf_ctr_bits  = 44,
-       .fixed_ctr_bits = 48,
-       .perf_ctr       = SNB_UNC_CBO_0_PER_CTR0,
-       .event_ctl      = SNB_UNC_CBO_0_PERFEVTSEL0,
-       .fixed_ctr      = SNB_UNC_FIXED_CTR,
-       .fixed_ctl      = SNB_UNC_FIXED_CTR_CTRL,
-       .single_fixed   = 1,
-       .event_mask     = SNB_UNC_RAW_EVENT_MASK,
-       .msr_offset     = SNB_UNC_CBO_MSR_OFFSET,
-       .ops            = &snb_uncore_msr_ops,
-       .format_group   = &snb_uncore_format_group,
-       .event_descs    = snb_uncore_events,
-};
-
-static struct intel_uncore_type snb_uncore_arb = {
-       .name           = "arb",
-       .num_counters   = 2,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 44,
-       .perf_ctr       = SNB_UNC_ARB_PER_CTR0,
-       .event_ctl      = SNB_UNC_ARB_PERFEVTSEL0,
-       .event_mask     = SNB_UNC_RAW_EVENT_MASK,
-       .msr_offset     = SNB_UNC_ARB_MSR_OFFSET,
-       .constraints    = snb_uncore_arb_constraints,
-       .ops            = &snb_uncore_msr_ops,
-       .format_group   = &snb_uncore_format_group,
-};
-
-static struct intel_uncore_type *snb_msr_uncores[] = {
-       &snb_uncore_cbox,
-       &snb_uncore_arb,
-       NULL,
-};
-
-void snb_uncore_cpu_init(void)
-{
-       uncore_msr_uncores = snb_msr_uncores;
-       if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-}
-
-enum {
-       SNB_PCI_UNCORE_IMC,
-};
-
-static struct uncore_event_desc snb_uncore_imc_events[] = {
-       INTEL_UNCORE_EVENT_DESC(data_reads,  "event=0x01"),
-       INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
-
-       INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
-       INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
-
-       { /* end: all zeroes */ },
-};
-
-#define SNB_UNCORE_PCI_IMC_EVENT_MASK          0xff
-#define SNB_UNCORE_PCI_IMC_BAR_OFFSET          0x48
-
-/* page size multiple covering all config regs */
-#define SNB_UNCORE_PCI_IMC_MAP_SIZE            0x6000
-
-#define SNB_UNCORE_PCI_IMC_DATA_READS          0x1
-#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE     0x5050
-#define SNB_UNCORE_PCI_IMC_DATA_WRITES         0x2
-#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE    0x5054
-#define SNB_UNCORE_PCI_IMC_CTR_BASE            SNB_UNCORE_PCI_IMC_DATA_READS_BASE
-
-static struct attribute *snb_uncore_imc_formats_attr[] = {
-       &format_attr_event.attr,
-       NULL,
-};
-
-static struct attribute_group snb_uncore_imc_format_group = {
-       .name = "format",
-       .attrs = snb_uncore_imc_formats_attr,
-};
-
-static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
-       resource_size_t addr;
-       u32 pci_dword;
-
-       pci_read_config_dword(pdev, where, &pci_dword);
-       addr = pci_dword;
-
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
-       pci_read_config_dword(pdev, where + 4, &pci_dword);
-       addr |= ((resource_size_t)pci_dword << 32);
-#endif
-
-       addr &= ~(PAGE_SIZE - 1);
-
-       box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
-       box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
-}
-
-static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
-{}
-
-static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
-{}
-
-static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{}
-
-static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{}
-
-static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
-}
-
-/*
- * custom event_init() function because we define our own fixed, free
- * running counters, so we do not want to conflict with generic uncore
- * logic. Also simplifies processing
- */
-static int snb_uncore_imc_event_init(struct perf_event *event)
-{
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       struct hw_perf_event *hwc = &event->hw;
-       u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
-       int idx, base;
-
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       pmu = uncore_event_to_pmu(event);
-       /* no device found for this pmu */
-       if (pmu->func_id < 0)
-               return -ENOENT;
-
-       /* Sampling not supported yet */
-       if (hwc->sample_period)
-               return -EINVAL;
-
-       /* unsupported modes and filters */
-       if (event->attr.exclude_user   ||
-           event->attr.exclude_kernel ||
-           event->attr.exclude_hv     ||
-           event->attr.exclude_idle   ||
-           event->attr.exclude_host   ||
-           event->attr.exclude_guest  ||
-           event->attr.sample_period) /* no sampling */
-               return -EINVAL;
-
-       /*
-        * Place all uncore events for a particular physical package
-        * onto a single cpu
-        */
-       if (event->cpu < 0)
-               return -EINVAL;
-
-       /* check only supported bits are set */
-       if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
-               return -EINVAL;
-
-       box = uncore_pmu_to_box(pmu, event->cpu);
-       if (!box || box->cpu < 0)
-               return -EINVAL;
-
-       event->cpu = box->cpu;
-
-       event->hw.idx = -1;
-       event->hw.last_tag = ~0ULL;
-       event->hw.extra_reg.idx = EXTRA_REG_NONE;
-       event->hw.branch_reg.idx = EXTRA_REG_NONE;
-       /*
-        * check event is known (whitelist, determines counter)
-        */
-       switch (cfg) {
-       case SNB_UNCORE_PCI_IMC_DATA_READS:
-               base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
-               idx = UNCORE_PMC_IDX_FIXED;
-               break;
-       case SNB_UNCORE_PCI_IMC_DATA_WRITES:
-               base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
-               idx = UNCORE_PMC_IDX_FIXED + 1;
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       /* must be done before validate_group */
-       event->hw.event_base = base;
-       event->hw.config = cfg;
-       event->hw.idx = idx;
-
-       /* no group validation needed, we have free running counters */
-
-       return 0;
-}
-
-static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       return 0;
-}
-
-static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       u64 count;
-
-       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
-               return;
-
-       event->hw.state = 0;
-       box->n_active++;
-
-       list_add_tail(&event->active_entry, &box->active_list);
-
-       count = snb_uncore_imc_read_counter(box, event);
-       local64_set(&event->hw.prev_count, count);
-
-       if (box->n_active == 1)
-               uncore_pmu_start_hrtimer(box);
-}
-
-static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (!(hwc->state & PERF_HES_STOPPED)) {
-               box->n_active--;
-
-               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-               hwc->state |= PERF_HES_STOPPED;
-
-               list_del(&event->active_entry);
-
-               if (box->n_active == 0)
-                       uncore_pmu_cancel_hrtimer(box);
-       }
-
-       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-               /*
-                * Drain the remaining delta count out of a event
-                * that we are disabling:
-                */
-               uncore_perf_event_update(box, event);
-               hwc->state |= PERF_HES_UPTODATE;
-       }
-}
-
-static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (!box)
-               return -ENODEV;
-
-       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-       if (!(flags & PERF_EF_START))
-               hwc->state |= PERF_HES_ARCH;
-
-       snb_uncore_imc_event_start(event, 0);
-
-       box->n_events++;
-
-       return 0;
-}
-
-static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       int i;
-
-       snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
-
-       for (i = 0; i < box->n_events; i++) {
-               if (event == box->event_list[i]) {
-                       --box->n_events;
-                       break;
-               }
-       }
-}
-
-int snb_pci2phy_map_init(int devid)
-{
-       struct pci_dev *dev = NULL;
-       struct pci2phy_map *map;
-       int bus, segment;
-
-       dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
-       if (!dev)
-               return -ENOTTY;
-
-       bus = dev->bus->number;
-       segment = pci_domain_nr(dev->bus);
-
-       raw_spin_lock(&pci2phy_map_lock);
-       map = __find_pci2phy_map(segment);
-       if (!map) {
-               raw_spin_unlock(&pci2phy_map_lock);
-               pci_dev_put(dev);
-               return -ENOMEM;
-       }
-       map->pbus_to_physid[bus] = 0;
-       raw_spin_unlock(&pci2phy_map_lock);
-
-       pci_dev_put(dev);
-
-       return 0;
-}
-
-static struct pmu snb_uncore_imc_pmu = {
-       .task_ctx_nr    = perf_invalid_context,
-       .event_init     = snb_uncore_imc_event_init,
-       .add            = snb_uncore_imc_event_add,
-       .del            = snb_uncore_imc_event_del,
-       .start          = snb_uncore_imc_event_start,
-       .stop           = snb_uncore_imc_event_stop,
-       .read           = uncore_pmu_event_read,
-};
-
-static struct intel_uncore_ops snb_uncore_imc_ops = {
-       .init_box       = snb_uncore_imc_init_box,
-       .enable_box     = snb_uncore_imc_enable_box,
-       .disable_box    = snb_uncore_imc_disable_box,
-       .disable_event  = snb_uncore_imc_disable_event,
-       .enable_event   = snb_uncore_imc_enable_event,
-       .hw_config      = snb_uncore_imc_hw_config,
-       .read_counter   = snb_uncore_imc_read_counter,
-};
-
-static struct intel_uncore_type snb_uncore_imc = {
-       .name           = "imc",
-       .num_counters   = 2,
-       .num_boxes      = 1,
-       .fixed_ctr_bits = 32,
-       .fixed_ctr      = SNB_UNCORE_PCI_IMC_CTR_BASE,
-       .event_descs    = snb_uncore_imc_events,
-       .format_group   = &snb_uncore_imc_format_group,
-       .perf_ctr       = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
-       .event_mask     = SNB_UNCORE_PCI_IMC_EVENT_MASK,
-       .ops            = &snb_uncore_imc_ops,
-       .pmu            = &snb_uncore_imc_pmu,
-};
-
-static struct intel_uncore_type *snb_pci_uncores[] = {
-       [SNB_PCI_UNCORE_IMC]    = &snb_uncore_imc,
-       NULL,
-};
-
-static const struct pci_device_id snb_uncore_pci_ids[] = {
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* end: all zeroes */ },
-};
-
-static const struct pci_device_id ivb_uncore_pci_ids[] = {
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* end: all zeroes */ },
-};
-
-static const struct pci_device_id hsw_uncore_pci_ids[] = {
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* end: all zeroes */ },
-};
-
-static const struct pci_device_id bdw_uncore_pci_ids[] = {
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* end: all zeroes */ },
-};
-
-static const struct pci_device_id skl_uncore_pci_ids[] = {
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* end: all zeroes */ },
-};
-
-static struct pci_driver snb_uncore_pci_driver = {
-       .name           = "snb_uncore",
-       .id_table       = snb_uncore_pci_ids,
-};
-
-static struct pci_driver ivb_uncore_pci_driver = {
-       .name           = "ivb_uncore",
-       .id_table       = ivb_uncore_pci_ids,
-};
-
-static struct pci_driver hsw_uncore_pci_driver = {
-       .name           = "hsw_uncore",
-       .id_table       = hsw_uncore_pci_ids,
-};
-
-static struct pci_driver bdw_uncore_pci_driver = {
-       .name           = "bdw_uncore",
-       .id_table       = bdw_uncore_pci_ids,
-};
-
-static struct pci_driver skl_uncore_pci_driver = {
-       .name           = "skl_uncore",
-       .id_table       = skl_uncore_pci_ids,
-};
-
-struct imc_uncore_pci_dev {
-       __u32 pci_id;
-       struct pci_driver *driver;
-};
-#define IMC_DEV(a, d) \
-       { .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) }
-
-static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
-       IMC_DEV(SNB_IMC, &snb_uncore_pci_driver),
-       IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),    /* 3rd Gen Core processor */
-       IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
-       IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),    /* 4th Gen Core Processor */
-       IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile Processor */
-       IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver),    /* 5th Gen Core U */
-       IMC_DEV(SKL_IMC, &skl_uncore_pci_driver),    /* 6th Gen Core */
-       {  /* end marker */ }
-};
-
-
-#define for_each_imc_pci_id(x, t) \
-       for (x = (t); (x)->pci_id; x++)
-
-static struct pci_driver *imc_uncore_find_dev(void)
-{
-       const struct imc_uncore_pci_dev *p;
-       int ret;
-
-       for_each_imc_pci_id(p, desktop_imc_pci_ids) {
-               ret = snb_pci2phy_map_init(p->pci_id);
-               if (ret == 0)
-                       return p->driver;
-       }
-       return NULL;
-}
-
-static int imc_uncore_pci_init(void)
-{
-       struct pci_driver *imc_drv = imc_uncore_find_dev();
-
-       if (!imc_drv)
-               return -ENODEV;
-
-       uncore_pci_uncores = snb_pci_uncores;
-       uncore_pci_driver = imc_drv;
-
-       return 0;
-}
-
-int snb_uncore_pci_init(void)
-{
-       return imc_uncore_pci_init();
-}
-
-int ivb_uncore_pci_init(void)
-{
-       return imc_uncore_pci_init();
-}
-int hsw_uncore_pci_init(void)
-{
-       return imc_uncore_pci_init();
-}
-
-int bdw_uncore_pci_init(void)
-{
-       return imc_uncore_pci_init();
-}
-
-int skl_uncore_pci_init(void)
-{
-       return imc_uncore_pci_init();
-}
-
-/* end of Sandy Bridge uncore support */
-
-/* Nehalem uncore support */
-static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
-       wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
-}
-
-static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
-       wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
-}
-
-static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (hwc->idx < UNCORE_PMC_IDX_FIXED)
-               wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
-       else
-               wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
-}
-
-static struct attribute *nhm_uncore_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask8.attr,
-       NULL,
-};
-
-static struct attribute_group nhm_uncore_format_group = {
-       .name = "format",
-       .attrs = nhm_uncore_formats_attr,
-};
-
-static struct uncore_event_desc nhm_uncore_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks,                "event=0xff,umask=0x00"),
-       INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any,       "event=0x2f,umask=0x0f"),
-       INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any,      "event=0x2c,umask=0x0f"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads,     "event=0x20,umask=0x01"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes,    "event=0x20,umask=0x02"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads,  "event=0x20,umask=0x04"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads,   "event=0x20,umask=0x10"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes,  "event=0x20,umask=0x20"),
-       { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhm_uncore_msr_ops = {
-       .disable_box    = nhm_uncore_msr_disable_box,
-       .enable_box     = nhm_uncore_msr_enable_box,
-       .disable_event  = snb_uncore_msr_disable_event,
-       .enable_event   = nhm_uncore_msr_enable_event,
-       .read_counter   = uncore_msr_read_counter,
-};
-
-static struct intel_uncore_type nhm_uncore = {
-       .name           = "",
-       .num_counters   = 8,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       .fixed_ctr_bits = 48,
-       .event_ctl      = NHM_UNC_PERFEVTSEL0,
-       .perf_ctr       = NHM_UNC_UNCORE_PMC0,
-       .fixed_ctr      = NHM_UNC_FIXED_CTR,
-       .fixed_ctl      = NHM_UNC_FIXED_CTR_CTRL,
-       .event_mask     = NHM_UNC_RAW_EVENT_MASK,
-       .event_descs    = nhm_uncore_events,
-       .ops            = &nhm_uncore_msr_ops,
-       .format_group   = &nhm_uncore_format_group,
-};
-
-static struct intel_uncore_type *nhm_msr_uncores[] = {
-       &nhm_uncore,
-       NULL,
-};
-
-void nhm_uncore_cpu_init(void)
-{
-       uncore_msr_uncores = nhm_msr_uncores;
-}
-
-/* end of Nehalem uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c

deleted file mode 100644 (file)

index 33acb88..0000000
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ /dev/null
@@ -1,3126 +0,0 @@
-/* SandyBridge-EP/IvyTown uncore support */
-#include "perf_event_intel_uncore.h"
-
-
-/* SNB-EP Box level control */
-#define SNBEP_PMON_BOX_CTL_RST_CTRL    (1 << 0)
-#define SNBEP_PMON_BOX_CTL_RST_CTRS    (1 << 1)
-#define SNBEP_PMON_BOX_CTL_FRZ         (1 << 8)
-#define SNBEP_PMON_BOX_CTL_FRZ_EN      (1 << 16)
-#define SNBEP_PMON_BOX_CTL_INT         (SNBEP_PMON_BOX_CTL_RST_CTRL | \
-                                        SNBEP_PMON_BOX_CTL_RST_CTRS | \
-                                        SNBEP_PMON_BOX_CTL_FRZ_EN)
-/* SNB-EP event control */
-#define SNBEP_PMON_CTL_EV_SEL_MASK     0x000000ff
-#define SNBEP_PMON_CTL_UMASK_MASK      0x0000ff00
-#define SNBEP_PMON_CTL_RST             (1 << 17)
-#define SNBEP_PMON_CTL_EDGE_DET                (1 << 18)
-#define SNBEP_PMON_CTL_EV_SEL_EXT      (1 << 21)
-#define SNBEP_PMON_CTL_EN              (1 << 22)
-#define SNBEP_PMON_CTL_INVERT          (1 << 23)
-#define SNBEP_PMON_CTL_TRESH_MASK      0xff000000
-#define SNBEP_PMON_RAW_EVENT_MASK      (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                        SNBEP_PMON_CTL_UMASK_MASK | \
-                                        SNBEP_PMON_CTL_EDGE_DET | \
-                                        SNBEP_PMON_CTL_INVERT | \
-                                        SNBEP_PMON_CTL_TRESH_MASK)
-
-/* SNB-EP Ubox event control */
-#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK                0x1f000000
-#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK                \
-                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                SNBEP_PMON_CTL_UMASK_MASK | \
-                                SNBEP_PMON_CTL_EDGE_DET | \
-                                SNBEP_PMON_CTL_INVERT | \
-                                SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
-
-#define SNBEP_CBO_PMON_CTL_TID_EN              (1 << 19)
-#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK      (SNBEP_PMON_RAW_EVENT_MASK | \
-                                                SNBEP_CBO_PMON_CTL_TID_EN)
-
-/* SNB-EP PCU event control */
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK    0x0000c000
-#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK      0x1f000000
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT      (1 << 30)
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET    (1 << 31)
-#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK      \
-                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
-                                SNBEP_PMON_CTL_EDGE_DET | \
-                                SNBEP_PMON_CTL_EV_SEL_EXT | \
-                                SNBEP_PMON_CTL_INVERT | \
-                                SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-
-#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK      \
-                               (SNBEP_PMON_RAW_EVENT_MASK | \
-                                SNBEP_PMON_CTL_EV_SEL_EXT)
-
-/* SNB-EP pci control register */
-#define SNBEP_PCI_PMON_BOX_CTL                 0xf4
-#define SNBEP_PCI_PMON_CTL0                    0xd8
-/* SNB-EP pci counter register */
-#define SNBEP_PCI_PMON_CTR0                    0xa0
-
-/* SNB-EP home agent register */
-#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0       0x40
-#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1       0x44
-#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH      0x48
-/* SNB-EP memory controller register */
-#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL                0xf0
-#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR                0xd0
-/* SNB-EP QPI register */
-#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0         0x228
-#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1         0x22c
-#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0          0x238
-#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1          0x23c
-
-/* SNB-EP Ubox register */
-#define SNBEP_U_MSR_PMON_CTR0                  0xc16
-#define SNBEP_U_MSR_PMON_CTL0                  0xc10
-
-#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL                0xc08
-#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR                0xc09
-
-/* SNB-EP Cbo register */
-#define SNBEP_C0_MSR_PMON_CTR0                 0xd16
-#define SNBEP_C0_MSR_PMON_CTL0                 0xd10
-#define SNBEP_C0_MSR_PMON_BOX_CTL              0xd04
-#define SNBEP_C0_MSR_PMON_BOX_FILTER           0xd14
-#define SNBEP_CBO_MSR_OFFSET                   0x20
-
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID      0x1f
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID      0x3fc00
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE    0x7c0000
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC      0xff800000
-
-#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) {   \
-       .event = (e),                           \
-       .msr = SNBEP_C0_MSR_PMON_BOX_FILTER,    \
-       .config_mask = (m),                     \
-       .idx = (i)                              \
-}
-
-/* SNB-EP PCU register */
-#define SNBEP_PCU_MSR_PMON_CTR0                        0xc36
-#define SNBEP_PCU_MSR_PMON_CTL0                        0xc30
-#define SNBEP_PCU_MSR_PMON_BOX_CTL             0xc24
-#define SNBEP_PCU_MSR_PMON_BOX_FILTER          0xc34
-#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK     0xffffffff
-#define SNBEP_PCU_MSR_CORE_C3_CTR              0x3fc
-#define SNBEP_PCU_MSR_CORE_C6_CTR              0x3fd
-
-/* IVBEP event control */
-#define IVBEP_PMON_BOX_CTL_INT         (SNBEP_PMON_BOX_CTL_RST_CTRL | \
-                                        SNBEP_PMON_BOX_CTL_RST_CTRS)
-#define IVBEP_PMON_RAW_EVENT_MASK              (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                        SNBEP_PMON_CTL_UMASK_MASK | \
-                                        SNBEP_PMON_CTL_EDGE_DET | \
-                                        SNBEP_PMON_CTL_TRESH_MASK)
-/* IVBEP Ubox */
-#define IVBEP_U_MSR_PMON_GLOBAL_CTL            0xc00
-#define IVBEP_U_PMON_GLOBAL_FRZ_ALL            (1 << 31)
-#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL          (1 << 29)
-
-#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK        \
-                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                SNBEP_PMON_CTL_UMASK_MASK | \
-                                SNBEP_PMON_CTL_EDGE_DET | \
-                                SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
-/* IVBEP Cbo */
-#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK              (IVBEP_PMON_RAW_EVENT_MASK | \
-                                                SNBEP_CBO_PMON_CTL_TID_EN)
-
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID              (0x1fULL << 0)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK     (0xfULL << 5)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE    (0x3fULL << 17)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID              (0xffffULL << 32)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC              (0x1ffULL << 52)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6               (0x1ULL << 61)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC               (0x1ULL << 62)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC     (0x1ULL << 63)
-
-/* IVBEP home agent */
-#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST                (1 << 16)
-#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK               \
-                               (IVBEP_PMON_RAW_EVENT_MASK | \
-                                IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST)
-/* IVBEP PCU */
-#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK      \
-                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                SNBEP_PMON_CTL_EV_SEL_EXT | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
-                                SNBEP_PMON_CTL_EDGE_DET | \
-                                SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-/* IVBEP QPI */
-#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK      \
-                               (IVBEP_PMON_RAW_EVENT_MASK | \
-                                SNBEP_PMON_CTL_EV_SEL_EXT)
-
-#define __BITS_VALUE(x, i, n)  ((typeof(x))(((x) >> ((i) * (n))) & \
-                               ((1ULL << (n)) - 1)))
-
-/* Haswell-EP Ubox */
-#define HSWEP_U_MSR_PMON_CTR0                  0x709
-#define HSWEP_U_MSR_PMON_CTL0                  0x705
-#define HSWEP_U_MSR_PMON_FILTER                        0x707
-
-#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL                0x703
-#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTR                0x704
-
-#define HSWEP_U_MSR_PMON_BOX_FILTER_TID                (0x1 << 0)
-#define HSWEP_U_MSR_PMON_BOX_FILTER_CID                (0x1fULL << 1)
-#define HSWEP_U_MSR_PMON_BOX_FILTER_MASK \
-                                       (HSWEP_U_MSR_PMON_BOX_FILTER_TID | \
-                                        HSWEP_U_MSR_PMON_BOX_FILTER_CID)
-
-/* Haswell-EP CBo */
-#define HSWEP_C0_MSR_PMON_CTR0                 0xe08
-#define HSWEP_C0_MSR_PMON_CTL0                 0xe01
-#define HSWEP_C0_MSR_PMON_BOX_CTL                      0xe00
-#define HSWEP_C0_MSR_PMON_BOX_FILTER0          0xe05
-#define HSWEP_CBO_MSR_OFFSET                   0x10
-
-
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_TID              (0x3fULL << 0)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK     (0xfULL << 6)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE    (0x7fULL << 17)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NID              (0xffffULL << 32)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC              (0x1ffULL << 52)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_C6               (0x1ULL << 61)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NC               (0x1ULL << 62)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC     (0x1ULL << 63)
-
-
-/* Haswell-EP Sbox */
-#define HSWEP_S0_MSR_PMON_CTR0                 0x726
-#define HSWEP_S0_MSR_PMON_CTL0                 0x721
-#define HSWEP_S0_MSR_PMON_BOX_CTL                      0x720
-#define HSWEP_SBOX_MSR_OFFSET                  0xa
-#define HSWEP_S_MSR_PMON_RAW_EVENT_MASK                (SNBEP_PMON_RAW_EVENT_MASK | \
-                                                SNBEP_CBO_PMON_CTL_TID_EN)
-
-/* Haswell-EP PCU */
-#define HSWEP_PCU_MSR_PMON_CTR0                        0x717
-#define HSWEP_PCU_MSR_PMON_CTL0                        0x711
-#define HSWEP_PCU_MSR_PMON_BOX_CTL             0x710
-#define HSWEP_PCU_MSR_PMON_BOX_FILTER          0x715
-
-/* KNL Ubox */
-#define KNL_U_MSR_PMON_RAW_EVENT_MASK \
-                                       (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \
-                                               SNBEP_CBO_PMON_CTL_TID_EN)
-/* KNL CHA */
-#define KNL_CHA_MSR_OFFSET                     0xc
-#define KNL_CHA_MSR_PMON_CTL_QOR               (1 << 16)
-#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \
-                                       (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \
-                                        KNL_CHA_MSR_PMON_CTL_QOR)
-#define KNL_CHA_MSR_PMON_BOX_FILTER_TID                0x1ff
-#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE      (7 << 18)
-#define KNL_CHA_MSR_PMON_BOX_FILTER_OP         (0xfffffe2aULL << 32)
-
-/* KNL EDC/MC UCLK */
-#define KNL_UCLK_MSR_PMON_CTR0_LOW             0x400
-#define KNL_UCLK_MSR_PMON_CTL0                 0x420
-#define KNL_UCLK_MSR_PMON_BOX_CTL              0x430
-#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW       0x44c
-#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL       0x454
-#define KNL_PMON_FIXED_CTL_EN                  0x1
-
-/* KNL EDC */
-#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW                0xa00
-#define KNL_EDC0_ECLK_MSR_PMON_CTL0            0xa20
-#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL         0xa30
-#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW  0xa3c
-#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL  0xa44
-
-/* KNL MC */
-#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW          0xb00
-#define KNL_MC0_CH0_MSR_PMON_CTL0              0xb20
-#define KNL_MC0_CH0_MSR_PMON_BOX_CTL           0xb30
-#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW         0xb3c
-#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL         0xb44
-
-/* KNL IRP */
-#define KNL_IRP_PCI_PMON_BOX_CTL               0xf0
-#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK                (SNBEP_PMON_RAW_EVENT_MASK | \
-                                                KNL_CHA_MSR_PMON_CTL_QOR)
-/* KNL PCU */
-#define KNL_PCU_PMON_CTL_EV_SEL_MASK           0x0000007f
-#define KNL_PCU_PMON_CTL_USE_OCC_CTR           (1 << 7)
-#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK                0x3f000000
-#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK        \
-                               (KNL_PCU_PMON_CTL_EV_SEL_MASK | \
-                                KNL_PCU_PMON_CTL_USE_OCC_CTR | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
-                                SNBEP_PMON_CTL_EDGE_DET | \
-                                SNBEP_CBO_PMON_CTL_TID_EN | \
-                                SNBEP_PMON_CTL_EV_SEL_EXT | \
-                                SNBEP_PMON_CTL_INVERT | \
-                                KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-
-DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
-DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
-DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
-DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
-DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
-DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
-DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
-DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29");
-DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
-DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
-DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
-DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
-DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31");
-DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
-DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
-DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
-DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8");
-DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
-DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
-DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20");
-DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33");
-DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
-DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
-DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
-DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
-DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
-DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
-DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
-DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
-DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
-DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
-DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
-DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
-DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
-
-static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       int box_ctl = uncore_pci_box_ctl(box);
-       u32 config = 0;
-
-       if (!pci_read_config_dword(pdev, box_ctl, &config)) {
-               config |= SNBEP_PMON_BOX_CTL_FRZ;
-               pci_write_config_dword(pdev, box_ctl, config);
-       }
-}
-
-static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       int box_ctl = uncore_pci_box_ctl(box);
-       u32 config = 0;
-
-       if (!pci_read_config_dword(pdev, box_ctl, &config)) {
-               config &= ~SNBEP_PMON_BOX_CTL_FRZ;
-               pci_write_config_dword(pdev, box_ctl, config);
-       }
-}
-
-static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-
-       pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-
-       pci_write_config_dword(pdev, hwc->config_base, hwc->config);
-}
-
-static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-       u64 count = 0;
-
-       pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
-       pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
-
-       return count;
-}
-
-static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       int box_ctl = uncore_pci_box_ctl(box);
-
-       pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT);
-}
-
-static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
-       u64 config;
-       unsigned msr;
-
-       msr = uncore_msr_box_ctl(box);
-       if (msr) {
-               rdmsrl(msr, config);
-               config |= SNBEP_PMON_BOX_CTL_FRZ;
-               wrmsrl(msr, config);
-       }
-}
-
-static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
-       u64 config;
-       unsigned msr;
-
-       msr = uncore_msr_box_ctl(box);
-       if (msr) {
-               rdmsrl(msr, config);
-               config &= ~SNBEP_PMON_BOX_CTL_FRZ;
-               wrmsrl(msr, config);
-       }
-}
-
-static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE)
-               wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0));
-
-       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
-                                       struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       wrmsrl(hwc->config_base, hwc->config);
-}
-
-static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
-{
-       unsigned msr = uncore_msr_box_ctl(box);
-
-       if (msr)
-               wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
-}
-
-static struct attribute *snbep_uncore_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       NULL,
-};
-
-static struct attribute *snbep_uncore_ubox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh5.attr,
-       NULL,
-};
-
-static struct attribute *snbep_uncore_cbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_filter_tid.attr,
-       &format_attr_filter_nid.attr,
-       &format_attr_filter_state.attr,
-       &format_attr_filter_opc.attr,
-       NULL,
-};
-
-static struct attribute *snbep_uncore_pcu_formats_attr[] = {
-       &format_attr_event_ext.attr,
-       &format_attr_occ_sel.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh5.attr,
-       &format_attr_occ_invert.attr,
-       &format_attr_occ_edge.attr,
-       &format_attr_filter_band0.attr,
-       &format_attr_filter_band1.attr,
-       &format_attr_filter_band2.attr,
-       &format_attr_filter_band3.attr,
-       NULL,
-};
-
-static struct attribute *snbep_uncore_qpi_formats_attr[] = {
-       &format_attr_event_ext.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_match_rds.attr,
-       &format_attr_match_rnid30.attr,
-       &format_attr_match_rnid4.attr,
-       &format_attr_match_dnid.attr,
-       &format_attr_match_mc.attr,
-       &format_attr_match_opc.attr,
-       &format_attr_match_vnw.attr,
-       &format_attr_match0.attr,
-       &format_attr_match1.attr,
-       &format_attr_mask_rds.attr,
-       &format_attr_mask_rnid30.attr,
-       &format_attr_mask_rnid4.attr,
-       &format_attr_mask_dnid.attr,
-       &format_attr_mask_mc.attr,
-       &format_attr_mask_opc.attr,
-       &format_attr_mask_vnw.attr,
-       &format_attr_mask0.attr,
-       &format_attr_mask1.attr,
-       NULL,
-};
-
-static struct uncore_event_desc snbep_uncore_imc_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks,      "event=0xff,umask=0x00"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read,  "event=0x04,umask=0x03"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
-       { /* end: all zeroes */ },
-};
-
-static struct uncore_event_desc snbep_uncore_qpi_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks,       "event=0x14"),
-       INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
-       INTEL_UNCORE_EVENT_DESC(drs_data,         "event=0x102,umask=0x08"),
-       INTEL_UNCORE_EVENT_DESC(ncb_data,         "event=0x103,umask=0x04"),
-       { /* end: all zeroes */ },
-};
-
-static struct attribute_group snbep_uncore_format_group = {
-       .name = "format",
-       .attrs = snbep_uncore_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_ubox_format_group = {
-       .name = "format",
-       .attrs = snbep_uncore_ubox_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_cbox_format_group = {
-       .name = "format",
-       .attrs = snbep_uncore_cbox_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_pcu_format_group = {
-       .name = "format",
-       .attrs = snbep_uncore_pcu_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_qpi_format_group = {
-       .name = "format",
-       .attrs = snbep_uncore_qpi_formats_attr,
-};
-
-#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT()                   \
-       .disable_box    = snbep_uncore_msr_disable_box,         \
-       .enable_box     = snbep_uncore_msr_enable_box,          \
-       .disable_event  = snbep_uncore_msr_disable_event,       \
-       .enable_event   = snbep_uncore_msr_enable_event,        \
-       .read_counter   = uncore_msr_read_counter
-
-#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT()                     \
-       __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),                   \
-       .init_box       = snbep_uncore_msr_init_box             \
-
-static struct intel_uncore_ops snbep_uncore_msr_ops = {
-       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-};
-
-#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT()                     \
-       .init_box       = snbep_uncore_pci_init_box,            \
-       .disable_box    = snbep_uncore_pci_disable_box,         \
-       .enable_box     = snbep_uncore_pci_enable_box,          \
-       .disable_event  = snbep_uncore_pci_disable_event,       \
-       .read_counter   = snbep_uncore_pci_read_counter
-
-static struct intel_uncore_ops snbep_uncore_pci_ops = {
-       SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
-       .enable_event   = snbep_uncore_pci_enable_event,        \
-};
-
-static struct event_constraint snbep_uncore_cbox_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
-       UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
-       UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
-       UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
-       EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
-       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2a, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type snbep_uncore_ubox = {
-       .name           = "ubox",
-       .num_counters   = 2,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 44,
-       .fixed_ctr_bits = 48,
-       .perf_ctr       = SNBEP_U_MSR_PMON_CTR0,
-       .event_ctl      = SNBEP_U_MSR_PMON_CTL0,
-       .event_mask     = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
-       .fixed_ctr      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
-       .fixed_ctl      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
-       .ops            = &snbep_uncore_msr_ops,
-       .format_group   = &snbep_uncore_ubox_format_group,
-};
-
-static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
-       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
-                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2),
-       EVENT_EXTRA_END
-};
-
-static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-       int i;
-
-       if (uncore_box_is_fake(box))
-               return;
-
-       for (i = 0; i < 5; i++) {
-               if (reg1->alloc & (0x1 << i))
-                       atomic_sub(1 << (i * 6), &er->ref);
-       }
-       reg1->alloc = 0;
-}
-
-static struct event_constraint *
-__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event,
-                           u64 (*cbox_filter_mask)(int fields))
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-       int i, alloc = 0;
-       unsigned long flags;
-       u64 mask;
-
-       if (reg1->idx == EXTRA_REG_NONE)
-               return NULL;
-
-       raw_spin_lock_irqsave(&er->lock, flags);
-       for (i = 0; i < 5; i++) {
-               if (!(reg1->idx & (0x1 << i)))
-                       continue;
-               if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
-                       continue;
-
-               mask = cbox_filter_mask(0x1 << i);
-               if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) ||
-                   !((reg1->config ^ er->config) & mask)) {
-                       atomic_add(1 << (i * 6), &er->ref);
-                       er->config &= ~mask;
-                       er->config |= reg1->config & mask;
-                       alloc |= (0x1 << i);
-               } else {
-                       break;
-               }
-       }
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-       if (i < 5)
-               goto fail;
-
-       if (!uncore_box_is_fake(box))
-               reg1->alloc |= alloc;
-
-       return NULL;
-fail:
-       for (; i >= 0; i--) {
-               if (alloc & (0x1 << i))
-                       atomic_sub(1 << (i * 6), &er->ref);
-       }
-       return &uncore_constraint_empty;
-}
-
-static u64 snbep_cbox_filter_mask(int fields)
-{
-       u64 mask = 0;
-
-       if (fields & 0x1)
-               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID;
-       if (fields & 0x2)
-               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID;
-       if (fields & 0x4)
-               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
-       if (fields & 0x8)
-               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
-
-       return mask;
-}
-
-static struct event_constraint *
-snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask);
-}
-
-static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct extra_reg *er;
-       int idx = 0;
-
-       for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) {
-               if (er->event != (event->hw.config & er->config_mask))
-                       continue;
-               idx |= er->idx;
-       }
-
-       if (idx) {
-               reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
-                       SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
-               reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx);
-               reg1->idx = idx;
-       }
-       return 0;
-}
-
-static struct intel_uncore_ops snbep_uncore_cbox_ops = {
-       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .hw_config              = snbep_cbox_hw_config,
-       .get_constraint         = snbep_cbox_get_constraint,
-       .put_constraint         = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type snbep_uncore_cbox = {
-       .name                   = "cbox",
-       .num_counters           = 4,
-       .num_boxes              = 8,
-       .perf_ctr_bits          = 44,
-       .event_ctl              = SNBEP_C0_MSR_PMON_CTL0,
-       .perf_ctr               = SNBEP_C0_MSR_PMON_CTR0,
-       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_C0_MSR_PMON_BOX_CTL,
-       .msr_offset             = SNBEP_CBO_MSR_OFFSET,
-       .num_shared_regs        = 1,
-       .constraints            = snbep_uncore_cbox_constraints,
-       .ops                    = &snbep_uncore_cbox_ops,
-       .format_group           = &snbep_uncore_cbox_format_group,
-};
-
-static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       u64 config = reg1->config;
-
-       if (new_idx > reg1->idx)
-               config <<= 8 * (new_idx - reg1->idx);
-       else
-               config >>= 8 * (reg1->idx - new_idx);
-
-       if (modify) {
-               hwc->config += new_idx - reg1->idx;
-               reg1->config = config;
-               reg1->idx = new_idx;
-       }
-       return config;
-}
-
-static struct event_constraint *
-snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-       unsigned long flags;
-       int idx = reg1->idx;
-       u64 mask, config1 = reg1->config;
-       bool ok = false;
-
-       if (reg1->idx == EXTRA_REG_NONE ||
-           (!uncore_box_is_fake(box) && reg1->alloc))
-               return NULL;
-again:
-       mask = 0xffULL << (idx * 8);
-       raw_spin_lock_irqsave(&er->lock, flags);
-       if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) ||
-           !((config1 ^ er->config) & mask)) {
-               atomic_add(1 << (idx * 8), &er->ref);
-               er->config &= ~mask;
-               er->config |= config1 & mask;
-               ok = true;
-       }
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-
-       if (!ok) {
-               idx = (idx + 1) % 4;
-               if (idx != reg1->idx) {
-                       config1 = snbep_pcu_alter_er(event, idx, false);
-                       goto again;
-               }
-               return &uncore_constraint_empty;
-       }
-
-       if (!uncore_box_is_fake(box)) {
-               if (idx != reg1->idx)
-                       snbep_pcu_alter_er(event, idx, true);
-               reg1->alloc = 1;
-       }
-       return NULL;
-}
-
-static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-
-       if (uncore_box_is_fake(box) || !reg1->alloc)
-               return;
-
-       atomic_sub(1 << (reg1->idx * 8), &er->ref);
-       reg1->alloc = 0;
-}
-
-static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
-
-       if (ev_sel >= 0xb && ev_sel <= 0xe) {
-               reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
-               reg1->idx = ev_sel - 0xb;
-               reg1->config = event->attr.config1 & (0xff << (reg1->idx * 8));
-       }
-       return 0;
-}
-
-static struct intel_uncore_ops snbep_uncore_pcu_ops = {
-       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .hw_config              = snbep_pcu_hw_config,
-       .get_constraint         = snbep_pcu_get_constraint,
-       .put_constraint         = snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type snbep_uncore_pcu = {
-       .name                   = "pcu",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCU_MSR_PMON_CTR0,
-       .event_ctl              = SNBEP_PCU_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCU_MSR_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &snbep_uncore_pcu_ops,
-       .format_group           = &snbep_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *snbep_msr_uncores[] = {
-       &snbep_uncore_ubox,
-       &snbep_uncore_cbox,
-       &snbep_uncore_pcu,
-       NULL,
-};
-
-void snbep_uncore_cpu_init(void)
-{
-       if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-       uncore_msr_uncores = snbep_msr_uncores;
-}
-
-enum {
-       SNBEP_PCI_QPI_PORT0_FILTER,
-       SNBEP_PCI_QPI_PORT1_FILTER,
-       HSWEP_PCI_PCU_3,
-};
-
-static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-       if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
-               reg1->idx = 0;
-               reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
-               reg1->config = event->attr.config1;
-               reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
-               reg2->config = event->attr.config2;
-       }
-       return 0;
-}
-
-static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE) {
-               int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
-               struct pci_dev *filter_pdev = uncore_extra_pci_dev[box->phys_id][idx];
-               if (filter_pdev) {
-                       pci_write_config_dword(filter_pdev, reg1->reg,
-                                               (u32)reg1->config);
-                       pci_write_config_dword(filter_pdev, reg1->reg + 4,
-                                               (u32)(reg1->config >> 32));
-                       pci_write_config_dword(filter_pdev, reg2->reg,
-                                               (u32)reg2->config);
-                       pci_write_config_dword(filter_pdev, reg2->reg + 4,
-                                               (u32)(reg2->config >> 32));
-               }
-       }
-
-       pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops snbep_uncore_qpi_ops = {
-       SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
-       .enable_event           = snbep_qpi_enable_event,
-       .hw_config              = snbep_qpi_hw_config,
-       .get_constraint         = uncore_get_constraint,
-       .put_constraint         = uncore_put_constraint,
-};
-
-#define SNBEP_UNCORE_PCI_COMMON_INIT()                         \
-       .perf_ctr       = SNBEP_PCI_PMON_CTR0,                  \
-       .event_ctl      = SNBEP_PCI_PMON_CTL0,                  \
-       .event_mask     = SNBEP_PMON_RAW_EVENT_MASK,            \
-       .box_ctl        = SNBEP_PCI_PMON_BOX_CTL,               \
-       .ops            = &snbep_uncore_pci_ops,                \
-       .format_group   = &snbep_uncore_format_group
-
-static struct intel_uncore_type snbep_uncore_ha = {
-       .name           = "ha",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_imc = {
-       .name           = "imc",
-       .num_counters   = 4,
-       .num_boxes      = 4,
-       .perf_ctr_bits  = 48,
-       .fixed_ctr_bits = 48,
-       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
-       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
-       .event_descs    = snbep_uncore_imc_events,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_qpi = {
-       .name                   = "qpi",
-       .num_counters           = 4,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
-       .event_ctl              = SNBEP_PCI_PMON_CTL0,
-       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &snbep_uncore_qpi_ops,
-       .event_descs            = snbep_uncore_qpi_events,
-       .format_group           = &snbep_uncore_qpi_format_group,
-};
-
-
-static struct intel_uncore_type snbep_uncore_r2pcie = {
-       .name           = "r2pcie",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 44,
-       .constraints    = snbep_uncore_r2pcie_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_r3qpi = {
-       .name           = "r3qpi",
-       .num_counters   = 3,
-       .num_boxes      = 2,
-       .perf_ctr_bits  = 44,
-       .constraints    = snbep_uncore_r3qpi_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
-       SNBEP_PCI_UNCORE_HA,
-       SNBEP_PCI_UNCORE_IMC,
-       SNBEP_PCI_UNCORE_QPI,
-       SNBEP_PCI_UNCORE_R2PCIE,
-       SNBEP_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *snbep_pci_uncores[] = {
-       [SNBEP_PCI_UNCORE_HA]           = &snbep_uncore_ha,
-       [SNBEP_PCI_UNCORE_IMC]          = &snbep_uncore_imc,
-       [SNBEP_PCI_UNCORE_QPI]          = &snbep_uncore_qpi,
-       [SNBEP_PCI_UNCORE_R2PCIE]       = &snbep_uncore_r2pcie,
-       [SNBEP_PCI_UNCORE_R3QPI]        = &snbep_uncore_r3qpi,
-       NULL,
-};
-
-static const struct pci_device_id snbep_uncore_pci_ids[] = {
-       { /* Home Agent */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
-       },
-       { /* MC Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
-       },
-       { /* MC Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
-       },
-       { /* MC Channel 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
-       },
-       { /* MC Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
-       },
-       { /* QPI Port 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
-       },
-       { /* QPI Port 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
-       },
-       { /* R2PCIe */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
-       },
-       { /* R3QPI Link 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
-       },
-       { /* R3QPI Link 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT0_FILTER),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT1_FILTER),
-       },
-       { /* end: all zeroes */ }
-};
-
-static struct pci_driver snbep_uncore_pci_driver = {
-       .name           = "snbep_uncore",
-       .id_table       = snbep_uncore_pci_ids,
-};
-
-/*
- * build pci bus to socket mapping
- */
-static int snbep_pci2phy_map_init(int devid)
-{
-       struct pci_dev *ubox_dev = NULL;
-       int i, bus, nodeid, segment;
-       struct pci2phy_map *map;
-       int err = 0;
-       u32 config = 0;
-
-       while (1) {
-               /* find the UBOX device */
-               ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev);
-               if (!ubox_dev)
-                       break;
-               bus = ubox_dev->bus->number;
-               /* get the Node ID of the local register */
-               err = pci_read_config_dword(ubox_dev, 0x40, &config);
-               if (err)
-                       break;
-               nodeid = config;
-               /* get the Node ID mapping */
-               err = pci_read_config_dword(ubox_dev, 0x54, &config);
-               if (err)
-                       break;
-
-               segment = pci_domain_nr(ubox_dev->bus);
-               raw_spin_lock(&pci2phy_map_lock);
-               map = __find_pci2phy_map(segment);
-               if (!map) {
-                       raw_spin_unlock(&pci2phy_map_lock);
-                       err = -ENOMEM;
-                       break;
-               }
-
-               /*
-                * every three bits in the Node ID mapping register maps
-                * to a particular node.
-                */
-               for (i = 0; i < 8; i++) {
-                       if (nodeid == ((config >> (3 * i)) & 0x7)) {
-                               map->pbus_to_physid[bus] = i;
-                               break;
-                       }
-               }
-               raw_spin_unlock(&pci2phy_map_lock);
-       }
-
-       if (!err) {
-               /*
-                * For PCI bus with no UBOX device, find the next bus
-                * that has UBOX device and use its mapping.
-                */
-               raw_spin_lock(&pci2phy_map_lock);
-               list_for_each_entry(map, &pci2phy_map_head, list) {
-                       i = -1;
-                       for (bus = 255; bus >= 0; bus--) {
-                               if (map->pbus_to_physid[bus] >= 0)
-                                       i = map->pbus_to_physid[bus];
-                               else
-                                       map->pbus_to_physid[bus] = i;
-                       }
-               }
-               raw_spin_unlock(&pci2phy_map_lock);
-       }
-
-       pci_dev_put(ubox_dev);
-
-       return err ? pcibios_err_to_errno(err) : 0;
-}
-
-int snbep_uncore_pci_init(void)
-{
-       int ret = snbep_pci2phy_map_init(0x3ce0);
-       if (ret)
-               return ret;
-       uncore_pci_uncores = snbep_pci_uncores;
-       uncore_pci_driver = &snbep_uncore_pci_driver;
-       return 0;
-}
-/* end of Sandy Bridge-EP uncore support */
-
-/* IvyTown uncore support */
-static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box)
-{
-       unsigned msr = uncore_msr_box_ctl(box);
-       if (msr)
-               wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT);
-}
-
-static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-
-       pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
-}
-
-#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT()                     \
-       .init_box       = ivbep_uncore_msr_init_box,            \
-       .disable_box    = snbep_uncore_msr_disable_box,         \
-       .enable_box     = snbep_uncore_msr_enable_box,          \
-       .disable_event  = snbep_uncore_msr_disable_event,       \
-       .enable_event   = snbep_uncore_msr_enable_event,        \
-       .read_counter   = uncore_msr_read_counter
-
-static struct intel_uncore_ops ivbep_uncore_msr_ops = {
-       IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-};
-
-static struct intel_uncore_ops ivbep_uncore_pci_ops = {
-       .init_box       = ivbep_uncore_pci_init_box,
-       .disable_box    = snbep_uncore_pci_disable_box,
-       .enable_box     = snbep_uncore_pci_enable_box,
-       .disable_event  = snbep_uncore_pci_disable_event,
-       .enable_event   = snbep_uncore_pci_enable_event,
-       .read_counter   = snbep_uncore_pci_read_counter,
-};
-
-#define IVBEP_UNCORE_PCI_COMMON_INIT()                         \
-       .perf_ctr       = SNBEP_PCI_PMON_CTR0,                  \
-       .event_ctl      = SNBEP_PCI_PMON_CTL0,                  \
-       .event_mask     = IVBEP_PMON_RAW_EVENT_MASK,            \
-       .box_ctl        = SNBEP_PCI_PMON_BOX_CTL,               \
-       .ops            = &ivbep_uncore_pci_ops,                        \
-       .format_group   = &ivbep_uncore_format_group
-
-static struct attribute *ivbep_uncore_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       NULL,
-};
-
-static struct attribute *ivbep_uncore_ubox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh5.attr,
-       NULL,
-};
-
-static struct attribute *ivbep_uncore_cbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_filter_tid.attr,
-       &format_attr_filter_link.attr,
-       &format_attr_filter_state2.attr,
-       &format_attr_filter_nid2.attr,
-       &format_attr_filter_opc2.attr,
-       &format_attr_filter_nc.attr,
-       &format_attr_filter_c6.attr,
-       &format_attr_filter_isoc.attr,
-       NULL,
-};
-
-static struct attribute *ivbep_uncore_pcu_formats_attr[] = {
-       &format_attr_event_ext.attr,
-       &format_attr_occ_sel.attr,
-       &format_attr_edge.attr,
-       &format_attr_thresh5.attr,
-       &format_attr_occ_invert.attr,
-       &format_attr_occ_edge.attr,
-       &format_attr_filter_band0.attr,
-       &format_attr_filter_band1.attr,
-       &format_attr_filter_band2.attr,
-       &format_attr_filter_band3.attr,
-       NULL,
-};
-
-static struct attribute *ivbep_uncore_qpi_formats_attr[] = {
-       &format_attr_event_ext.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_match_rds.attr,
-       &format_attr_match_rnid30.attr,
-       &format_attr_match_rnid4.attr,
-       &format_attr_match_dnid.attr,
-       &format_attr_match_mc.attr,
-       &format_attr_match_opc.attr,
-       &format_attr_match_vnw.attr,
-       &format_attr_match0.attr,
-       &format_attr_match1.attr,
-       &format_attr_mask_rds.attr,
-       &format_attr_mask_rnid30.attr,
-       &format_attr_mask_rnid4.attr,
-       &format_attr_mask_dnid.attr,
-       &format_attr_mask_mc.attr,
-       &format_attr_mask_opc.attr,
-       &format_attr_mask_vnw.attr,
-       &format_attr_mask0.attr,
-       &format_attr_mask1.attr,
-       NULL,
-};
-
-static struct attribute_group ivbep_uncore_format_group = {
-       .name = "format",
-       .attrs = ivbep_uncore_formats_attr,
-};
-
-static struct attribute_group ivbep_uncore_ubox_format_group = {
-       .name = "format",
-       .attrs = ivbep_uncore_ubox_formats_attr,
-};
-
-static struct attribute_group ivbep_uncore_cbox_format_group = {
-       .name = "format",
-       .attrs = ivbep_uncore_cbox_formats_attr,
-};
-
-static struct attribute_group ivbep_uncore_pcu_format_group = {
-       .name = "format",
-       .attrs = ivbep_uncore_pcu_formats_attr,
-};
-
-static struct attribute_group ivbep_uncore_qpi_format_group = {
-       .name = "format",
-       .attrs = ivbep_uncore_qpi_formats_attr,
-};
-
-static struct intel_uncore_type ivbep_uncore_ubox = {
-       .name           = "ubox",
-       .num_counters   = 2,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 44,
-       .fixed_ctr_bits = 48,
-       .perf_ctr       = SNBEP_U_MSR_PMON_CTR0,
-       .event_ctl      = SNBEP_U_MSR_PMON_CTL0,
-       .event_mask     = IVBEP_U_MSR_PMON_RAW_EVENT_MASK,
-       .fixed_ctr      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
-       .fixed_ctl      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
-       .ops            = &ivbep_uncore_msr_ops,
-       .format_group   = &ivbep_uncore_ubox_format_group,
-};
-
-static struct extra_reg ivbep_uncore_cbox_extra_regs[] = {
-       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
-                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
-       EVENT_EXTRA_END
-};
-
-static u64 ivbep_cbox_filter_mask(int fields)
-{
-       u64 mask = 0;
-
-       if (fields & 0x1)
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_TID;
-       if (fields & 0x2)
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK;
-       if (fields & 0x4)
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
-       if (fields & 0x8)
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NID;
-       if (fields & 0x10) {
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NC;
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_C6;
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
-       }
-
-       return mask;
-}
-
-static struct event_constraint *
-ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask);
-}
-
-static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct extra_reg *er;
-       int idx = 0;
-
-       for (er = ivbep_uncore_cbox_extra_regs; er->msr; er++) {
-               if (er->event != (event->hw.config & er->config_mask))
-                       continue;
-               idx |= er->idx;
-       }
-
-       if (idx) {
-               reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
-                       SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
-               reg1->config = event->attr.config1 & ivbep_cbox_filter_mask(idx);
-               reg1->idx = idx;
-       }
-       return 0;
-}
-
-static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE) {
-               u64 filter = uncore_shared_reg_config(box, 0);
-               wrmsrl(reg1->reg, filter & 0xffffffff);
-               wrmsrl(reg1->reg + 6, filter >> 32);
-       }
-
-       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops ivbep_uncore_cbox_ops = {
-       .init_box               = ivbep_uncore_msr_init_box,
-       .disable_box            = snbep_uncore_msr_disable_box,
-       .enable_box             = snbep_uncore_msr_enable_box,
-       .disable_event          = snbep_uncore_msr_disable_event,
-       .enable_event           = ivbep_cbox_enable_event,
-       .read_counter           = uncore_msr_read_counter,
-       .hw_config              = ivbep_cbox_hw_config,
-       .get_constraint         = ivbep_cbox_get_constraint,
-       .put_constraint         = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type ivbep_uncore_cbox = {
-       .name                   = "cbox",
-       .num_counters           = 4,
-       .num_boxes              = 15,
-       .perf_ctr_bits          = 44,
-       .event_ctl              = SNBEP_C0_MSR_PMON_CTL0,
-       .perf_ctr               = SNBEP_C0_MSR_PMON_CTR0,
-       .event_mask             = IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_C0_MSR_PMON_BOX_CTL,
-       .msr_offset             = SNBEP_CBO_MSR_OFFSET,
-       .num_shared_regs        = 1,
-       .constraints            = snbep_uncore_cbox_constraints,
-       .ops                    = &ivbep_uncore_cbox_ops,
-       .format_group           = &ivbep_uncore_cbox_format_group,
-};
-
-static struct intel_uncore_ops ivbep_uncore_pcu_ops = {
-       IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .hw_config              = snbep_pcu_hw_config,
-       .get_constraint         = snbep_pcu_get_constraint,
-       .put_constraint         = snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type ivbep_uncore_pcu = {
-       .name                   = "pcu",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCU_MSR_PMON_CTR0,
-       .event_ctl              = SNBEP_PCU_MSR_PMON_CTL0,
-       .event_mask             = IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCU_MSR_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &ivbep_uncore_pcu_ops,
-       .format_group           = &ivbep_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *ivbep_msr_uncores[] = {
-       &ivbep_uncore_ubox,
-       &ivbep_uncore_cbox,
-       &ivbep_uncore_pcu,
-       NULL,
-};
-
-void ivbep_uncore_cpu_init(void)
-{
-       if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-       uncore_msr_uncores = ivbep_msr_uncores;
-}
-
-static struct intel_uncore_type ivbep_uncore_ha = {
-       .name           = "ha",
-       .num_counters   = 4,
-       .num_boxes      = 2,
-       .perf_ctr_bits  = 48,
-       IVBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type ivbep_uncore_imc = {
-       .name           = "imc",
-       .num_counters   = 4,
-       .num_boxes      = 8,
-       .perf_ctr_bits  = 48,
-       .fixed_ctr_bits = 48,
-       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
-       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
-       .event_descs    = snbep_uncore_imc_events,
-       IVBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-/* registers in IRP boxes are not properly aligned */
-static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
-static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
-
-static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-
-       pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx],
-                              hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-
-       pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], hwc->config);
-}
-
-static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-       u64 count = 0;
-
-       pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
-       pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
-
-       return count;
-}
-
-static struct intel_uncore_ops ivbep_uncore_irp_ops = {
-       .init_box       = ivbep_uncore_pci_init_box,
-       .disable_box    = snbep_uncore_pci_disable_box,
-       .enable_box     = snbep_uncore_pci_enable_box,
-       .disable_event  = ivbep_uncore_irp_disable_event,
-       .enable_event   = ivbep_uncore_irp_enable_event,
-       .read_counter   = ivbep_uncore_irp_read_counter,
-};
-
-static struct intel_uncore_type ivbep_uncore_irp = {
-       .name                   = "irp",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .event_mask             = IVBEP_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .ops                    = &ivbep_uncore_irp_ops,
-       .format_group           = &ivbep_uncore_format_group,
-};
-
-static struct intel_uncore_ops ivbep_uncore_qpi_ops = {
-       .init_box       = ivbep_uncore_pci_init_box,
-       .disable_box    = snbep_uncore_pci_disable_box,
-       .enable_box     = snbep_uncore_pci_enable_box,
-       .disable_event  = snbep_uncore_pci_disable_event,
-       .enable_event   = snbep_qpi_enable_event,
-       .read_counter   = snbep_uncore_pci_read_counter,
-       .hw_config      = snbep_qpi_hw_config,
-       .get_constraint = uncore_get_constraint,
-       .put_constraint = uncore_put_constraint,
-};
-
-static struct intel_uncore_type ivbep_uncore_qpi = {
-       .name                   = "qpi",
-       .num_counters           = 4,
-       .num_boxes              = 3,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
-       .event_ctl              = SNBEP_PCI_PMON_CTL0,
-       .event_mask             = IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &ivbep_uncore_qpi_ops,
-       .format_group           = &ivbep_uncore_qpi_format_group,
-};
-
-static struct intel_uncore_type ivbep_uncore_r2pcie = {
-       .name           = "r2pcie",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 44,
-       .constraints    = snbep_uncore_r2pcie_constraints,
-       IVBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type ivbep_uncore_r3qpi = {
-       .name           = "r3qpi",
-       .num_counters   = 3,
-       .num_boxes      = 2,
-       .perf_ctr_bits  = 44,
-       .constraints    = snbep_uncore_r3qpi_constraints,
-       IVBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
-       IVBEP_PCI_UNCORE_HA,
-       IVBEP_PCI_UNCORE_IMC,
-       IVBEP_PCI_UNCORE_IRP,
-       IVBEP_PCI_UNCORE_QPI,
-       IVBEP_PCI_UNCORE_R2PCIE,
-       IVBEP_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *ivbep_pci_uncores[] = {
-       [IVBEP_PCI_UNCORE_HA]   = &ivbep_uncore_ha,
-       [IVBEP_PCI_UNCORE_IMC]  = &ivbep_uncore_imc,
-       [IVBEP_PCI_UNCORE_IRP]  = &ivbep_uncore_irp,
-       [IVBEP_PCI_UNCORE_QPI]  = &ivbep_uncore_qpi,
-       [IVBEP_PCI_UNCORE_R2PCIE]       = &ivbep_uncore_r2pcie,
-       [IVBEP_PCI_UNCORE_R3QPI]        = &ivbep_uncore_r3qpi,
-       NULL,
-};
-
-static const struct pci_device_id ivbep_uncore_pci_ids[] = {
-       { /* Home Agent 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0),
-       },
-       { /* Home Agent 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1),
-       },
-       { /* MC0 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0),
-       },
-       { /* MC0 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1),
-       },
-       { /* MC0 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2),
-       },
-       { /* MC0 Channel 4 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3),
-       },
-       { /* MC1 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4),
-       },
-       { /* MC1 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5),
-       },
-       { /* MC1 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6),
-       },
-       { /* MC1 Channel 4 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7),
-       },
-       { /* IRP */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0),
-       },
-       { /* QPI0 Port 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0),
-       },
-       { /* QPI0 Port 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1),
-       },
-       { /* QPI1 Port 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2),
-       },
-       { /* R2PCIe */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0),
-       },
-       { /* R3QPI0 Link 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0),
-       },
-       { /* R3QPI0 Link 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1),
-       },
-       { /* R3QPI1 Link 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT0_FILTER),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT1_FILTER),
-       },
-       { /* end: all zeroes */ }
-};
-
-static struct pci_driver ivbep_uncore_pci_driver = {
-       .name           = "ivbep_uncore",
-       .id_table       = ivbep_uncore_pci_ids,
-};
-
-int ivbep_uncore_pci_init(void)
-{
-       int ret = snbep_pci2phy_map_init(0x0e1e);
-       if (ret)
-               return ret;
-       uncore_pci_uncores = ivbep_pci_uncores;
-       uncore_pci_driver = &ivbep_uncore_pci_driver;
-       return 0;
-}
-/* end of IvyTown uncore support */
-
-/* KNL uncore support */
-static struct attribute *knl_uncore_ubox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh5.attr,
-       NULL,
-};
-
-static struct attribute_group knl_uncore_ubox_format_group = {
-       .name = "format",
-       .attrs = knl_uncore_ubox_formats_attr,
-};
-
-static struct intel_uncore_type knl_uncore_ubox = {
-       .name                   = "ubox",
-       .num_counters           = 2,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
-       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
-       .event_mask             = KNL_U_MSR_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
-       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
-       .ops                    = &snbep_uncore_msr_ops,
-       .format_group           = &knl_uncore_ubox_format_group,
-};
-
-static struct attribute *knl_uncore_cha_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_qor.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_filter_tid4.attr,
-       &format_attr_filter_link3.attr,
-       &format_attr_filter_state4.attr,
-       &format_attr_filter_local.attr,
-       &format_attr_filter_all_op.attr,
-       &format_attr_filter_nnm.attr,
-       &format_attr_filter_opc3.attr,
-       &format_attr_filter_nc.attr,
-       &format_attr_filter_isoc.attr,
-       NULL,
-};
-
-static struct attribute_group knl_uncore_cha_format_group = {
-       .name = "format",
-       .attrs = knl_uncore_cha_formats_attr,
-};
-
-static struct event_constraint knl_uncore_cha_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x1f, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg knl_uncore_cha_extra_regs[] = {
-       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
-                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4),
-       EVENT_EXTRA_END
-};
-
-static u64 knl_cha_filter_mask(int fields)
-{
-       u64 mask = 0;
-
-       if (fields & 0x1)
-               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID;
-       if (fields & 0x2)
-               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE;
-       if (fields & 0x4)
-               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP;
-       return mask;
-}
-
-static struct event_constraint *
-knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask);
-}
-
-static int knl_cha_hw_config(struct intel_uncore_box *box,
-                            struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct extra_reg *er;
-       int idx = 0;
-
-       for (er = knl_uncore_cha_extra_regs; er->msr; er++) {
-               if (er->event != (event->hw.config & er->config_mask))
-                       continue;
-               idx |= er->idx;
-       }
-
-       if (idx) {
-               reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
-                           KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx;
-               reg1->config = event->attr.config1 & knl_cha_filter_mask(idx);
-               reg1->idx = idx;
-       }
-       return 0;
-}
-
-static void hswep_cbox_enable_event(struct intel_uncore_box *box,
-                                   struct perf_event *event);
-
-static struct intel_uncore_ops knl_uncore_cha_ops = {
-       .init_box               = snbep_uncore_msr_init_box,
-       .disable_box            = snbep_uncore_msr_disable_box,
-       .enable_box             = snbep_uncore_msr_enable_box,
-       .disable_event          = snbep_uncore_msr_disable_event,
-       .enable_event           = hswep_cbox_enable_event,
-       .read_counter           = uncore_msr_read_counter,
-       .hw_config              = knl_cha_hw_config,
-       .get_constraint         = knl_cha_get_constraint,
-       .put_constraint         = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type knl_uncore_cha = {
-       .name                   = "cha",
-       .num_counters           = 4,
-       .num_boxes              = 38,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
-       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
-       .event_mask             = KNL_CHA_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
-       .msr_offset             = KNL_CHA_MSR_OFFSET,
-       .num_shared_regs        = 1,
-       .constraints            = knl_uncore_cha_constraints,
-       .ops                    = &knl_uncore_cha_ops,
-       .format_group           = &knl_uncore_cha_format_group,
-};
-
-static struct attribute *knl_uncore_pcu_formats_attr[] = {
-       &format_attr_event2.attr,
-       &format_attr_use_occ_ctr.attr,
-       &format_attr_occ_sel.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh6.attr,
-       &format_attr_occ_invert.attr,
-       &format_attr_occ_edge_det.attr,
-       NULL,
-};
-
-static struct attribute_group knl_uncore_pcu_format_group = {
-       .name = "format",
-       .attrs = knl_uncore_pcu_formats_attr,
-};
-
-static struct intel_uncore_type knl_uncore_pcu = {
-       .name                   = "pcu",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = HSWEP_PCU_MSR_PMON_CTR0,
-       .event_ctl              = HSWEP_PCU_MSR_PMON_CTL0,
-       .event_mask             = KNL_PCU_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_PCU_MSR_PMON_BOX_CTL,
-       .ops                    = &snbep_uncore_msr_ops,
-       .format_group           = &knl_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *knl_msr_uncores[] = {
-       &knl_uncore_ubox,
-       &knl_uncore_cha,
-       &knl_uncore_pcu,
-       NULL,
-};
-
-void knl_uncore_cpu_init(void)
-{
-       uncore_msr_uncores = knl_msr_uncores;
-}
-
-static void knl_uncore_imc_enable_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       int box_ctl = uncore_pci_box_ctl(box);
-
-       pci_write_config_dword(pdev, box_ctl, 0);
-}
-
-static void knl_uncore_imc_enable_event(struct intel_uncore_box *box,
-                                       struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-
-       if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK)
-                                                       == UNCORE_FIXED_EVENT)
-               pci_write_config_dword(pdev, hwc->config_base,
-                                      hwc->config | KNL_PMON_FIXED_CTL_EN);
-       else
-               pci_write_config_dword(pdev, hwc->config_base,
-                                      hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops knl_uncore_imc_ops = {
-       .init_box       = snbep_uncore_pci_init_box,
-       .disable_box    = snbep_uncore_pci_disable_box,
-       .enable_box     = knl_uncore_imc_enable_box,
-       .read_counter   = snbep_uncore_pci_read_counter,
-       .enable_event   = knl_uncore_imc_enable_event,
-       .disable_event  = snbep_uncore_pci_disable_event,
-};
-
-static struct intel_uncore_type knl_uncore_imc_uclk = {
-       .name                   = "imc_uclk",
-       .num_counters           = 4,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = KNL_UCLK_MSR_PMON_CTR0_LOW,
-       .event_ctl              = KNL_UCLK_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
-       .fixed_ctl              = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
-       .box_ctl                = KNL_UCLK_MSR_PMON_BOX_CTL,
-       .ops                    = &knl_uncore_imc_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type knl_uncore_imc_dclk = {
-       .name                   = "imc",
-       .num_counters           = 4,
-       .num_boxes              = 6,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = KNL_MC0_CH0_MSR_PMON_CTR0_LOW,
-       .event_ctl              = KNL_MC0_CH0_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = KNL_MC0_CH0_MSR_PMON_FIXED_LOW,
-       .fixed_ctl              = KNL_MC0_CH0_MSR_PMON_FIXED_CTL,
-       .box_ctl                = KNL_MC0_CH0_MSR_PMON_BOX_CTL,
-       .ops                    = &knl_uncore_imc_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type knl_uncore_edc_uclk = {
-       .name                   = "edc_uclk",
-       .num_counters           = 4,
-       .num_boxes              = 8,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = KNL_UCLK_MSR_PMON_CTR0_LOW,
-       .event_ctl              = KNL_UCLK_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
-       .fixed_ctl              = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
-       .box_ctl                = KNL_UCLK_MSR_PMON_BOX_CTL,
-       .ops                    = &knl_uncore_imc_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type knl_uncore_edc_eclk = {
-       .name                   = "edc_eclk",
-       .num_counters           = 4,
-       .num_boxes              = 8,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW,
-       .event_ctl              = KNL_EDC0_ECLK_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW,
-       .fixed_ctl              = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL,
-       .box_ctl                = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL,
-       .ops                    = &knl_uncore_imc_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct event_constraint knl_uncore_m2pcie_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type knl_uncore_m2pcie = {
-       .name           = "m2pcie",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       .constraints    = knl_uncore_m2pcie_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct attribute *knl_uncore_irp_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_qor.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       NULL,
-};
-
-static struct attribute_group knl_uncore_irp_format_group = {
-       .name = "format",
-       .attrs = knl_uncore_irp_formats_attr,
-};
-
-static struct intel_uncore_type knl_uncore_irp = {
-       .name                   = "irp",
-       .num_counters           = 2,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
-       .event_ctl              = SNBEP_PCI_PMON_CTL0,
-       .event_mask             = KNL_IRP_PCI_PMON_RAW_EVENT_MASK,
-       .box_ctl                = KNL_IRP_PCI_PMON_BOX_CTL,
-       .ops                    = &snbep_uncore_pci_ops,
-       .format_group           = &knl_uncore_irp_format_group,
-};
-
-enum {
-       KNL_PCI_UNCORE_MC_UCLK,
-       KNL_PCI_UNCORE_MC_DCLK,
-       KNL_PCI_UNCORE_EDC_UCLK,
-       KNL_PCI_UNCORE_EDC_ECLK,
-       KNL_PCI_UNCORE_M2PCIE,
-       KNL_PCI_UNCORE_IRP,
-};
-
-static struct intel_uncore_type *knl_pci_uncores[] = {
-       [KNL_PCI_UNCORE_MC_UCLK]        = &knl_uncore_imc_uclk,
-       [KNL_PCI_UNCORE_MC_DCLK]        = &knl_uncore_imc_dclk,
-       [KNL_PCI_UNCORE_EDC_UCLK]       = &knl_uncore_edc_uclk,
-       [KNL_PCI_UNCORE_EDC_ECLK]       = &knl_uncore_edc_eclk,
-       [KNL_PCI_UNCORE_M2PCIE]         = &knl_uncore_m2pcie,
-       [KNL_PCI_UNCORE_IRP]            = &knl_uncore_irp,
-       NULL,
-};
-
-/*
- * KNL uses a common PCI device ID for multiple instances of an Uncore PMU
- * device type. prior to KNL, each instance of a PMU device type had a unique
- * device ID.
- *
- *     PCI Device ID   Uncore PMU Devices
- *     ----------------------------------
- *     0x7841          MC0 UClk, MC1 UClk
- *     0x7843          MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2,
- *                     MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2
- *     0x7833          EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk,
- *                     EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk
- *     0x7835          EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk,
- *                     EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk
- *     0x7817          M2PCIe
- *     0x7814          IRP
-*/
-
-static const struct pci_device_id knl_uncore_pci_ids[] = {
-       { /* MC UClk */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0),
-       },
-       { /* MC DClk Channel */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0),
-       },
-       { /* EDC UClk */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0),
-       },
-       { /* EDC EClk */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0),
-       },
-       { /* M2PCIe */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0),
-       },
-       { /* IRP */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0),
-       },
-       { /* end: all zeroes */ }
-};
-
-static struct pci_driver knl_uncore_pci_driver = {
-       .name           = "knl_uncore",
-       .id_table       = knl_uncore_pci_ids,
-};
-
-int knl_uncore_pci_init(void)
-{
-       int ret;
-
-       /* All KNL PCI based PMON units are on the same PCI bus except IRP */
-       ret = snb_pci2phy_map_init(0x7814); /* IRP */
-       if (ret)
-               return ret;
-       ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */
-       if (ret)
-               return ret;
-       uncore_pci_uncores = knl_pci_uncores;
-       uncore_pci_driver = &knl_uncore_pci_driver;
-       return 0;
-}
-
-/* end of KNL uncore support */
-
-/* Haswell-EP uncore support */
-static struct attribute *hswep_uncore_ubox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh5.attr,
-       &format_attr_filter_tid2.attr,
-       &format_attr_filter_cid.attr,
-       NULL,
-};
-
-static struct attribute_group hswep_uncore_ubox_format_group = {
-       .name = "format",
-       .attrs = hswep_uncore_ubox_formats_attr,
-};
-
-static int hswep_ubox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       reg1->reg = HSWEP_U_MSR_PMON_FILTER;
-       reg1->config = event->attr.config1 & HSWEP_U_MSR_PMON_BOX_FILTER_MASK;
-       reg1->idx = 0;
-       return 0;
-}
-
-static struct intel_uncore_ops hswep_uncore_ubox_ops = {
-       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .hw_config              = hswep_ubox_hw_config,
-       .get_constraint         = uncore_get_constraint,
-       .put_constraint         = uncore_put_constraint,
-};
-
-static struct intel_uncore_type hswep_uncore_ubox = {
-       .name                   = "ubox",
-       .num_counters           = 2,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 44,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
-       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
-       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &hswep_uncore_ubox_ops,
-       .format_group           = &hswep_uncore_ubox_format_group,
-};
-
-static struct attribute *hswep_uncore_cbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_filter_tid3.attr,
-       &format_attr_filter_link2.attr,
-       &format_attr_filter_state3.attr,
-       &format_attr_filter_nid2.attr,
-       &format_attr_filter_opc2.attr,
-       &format_attr_filter_nc.attr,
-       &format_attr_filter_c6.attr,
-       &format_attr_filter_isoc.attr,
-       NULL,
-};
-
-static struct attribute_group hswep_uncore_cbox_format_group = {
-       .name = "format",
-       .attrs = hswep_uncore_cbox_formats_attr,
-};
-
-static struct event_constraint hswep_uncore_cbox_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x09, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg hswep_uncore_cbox_extra_regs[] = {
-       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
-                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4028, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4032, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4029, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4033, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x402A, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x12),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
-       EVENT_EXTRA_END
-};
-
-static u64 hswep_cbox_filter_mask(int fields)
-{
-       u64 mask = 0;
-       if (fields & 0x1)
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_TID;
-       if (fields & 0x2)
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK;
-       if (fields & 0x4)
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE;
-       if (fields & 0x8)
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NID;
-       if (fields & 0x10) {
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC;
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NC;
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_C6;
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
-       }
-       return mask;
-}
-
-static struct event_constraint *
-hswep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       return __snbep_cbox_get_constraint(box, event, hswep_cbox_filter_mask);
-}
-
-static int hswep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct extra_reg *er;
-       int idx = 0;
-
-       for (er = hswep_uncore_cbox_extra_regs; er->msr; er++) {
-               if (er->event != (event->hw.config & er->config_mask))
-                       continue;
-               idx |= er->idx;
-       }
-
-       if (idx) {
-               reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
-                           HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
-               reg1->config = event->attr.config1 & hswep_cbox_filter_mask(idx);
-               reg1->idx = idx;
-       }
-       return 0;
-}
-
-static void hswep_cbox_enable_event(struct intel_uncore_box *box,
-                                 struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE) {
-               u64 filter = uncore_shared_reg_config(box, 0);
-               wrmsrl(reg1->reg, filter & 0xffffffff);
-               wrmsrl(reg1->reg + 1, filter >> 32);
-       }
-
-       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops hswep_uncore_cbox_ops = {
-       .init_box               = snbep_uncore_msr_init_box,
-       .disable_box            = snbep_uncore_msr_disable_box,
-       .enable_box             = snbep_uncore_msr_enable_box,
-       .disable_event          = snbep_uncore_msr_disable_event,
-       .enable_event           = hswep_cbox_enable_event,
-       .read_counter           = uncore_msr_read_counter,
-       .hw_config              = hswep_cbox_hw_config,
-       .get_constraint         = hswep_cbox_get_constraint,
-       .put_constraint         = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type hswep_uncore_cbox = {
-       .name                   = "cbox",
-       .num_counters           = 4,
-       .num_boxes              = 18,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
-       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
-       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
-       .msr_offset             = HSWEP_CBO_MSR_OFFSET,
-       .num_shared_regs        = 1,
-       .constraints            = hswep_uncore_cbox_constraints,
-       .ops                    = &hswep_uncore_cbox_ops,
-       .format_group           = &hswep_uncore_cbox_format_group,
-};
-
-/*
- * Write SBOX Initialization register bit by bit to avoid spurious #GPs
- */
-static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box)
-{
-       unsigned msr = uncore_msr_box_ctl(box);
-
-       if (msr) {
-               u64 init = SNBEP_PMON_BOX_CTL_INT;
-               u64 flags = 0;
-               int i;
-
-               for_each_set_bit(i, (unsigned long *)&init, 64) {
-                       flags |= (1ULL << i);
-                       wrmsrl(msr, flags);
-               }
-       }
-}
-
-static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = {
-       __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .init_box               = hswep_uncore_sbox_msr_init_box
-};
-
-static struct attribute *hswep_uncore_sbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       NULL,
-};
-
-static struct attribute_group hswep_uncore_sbox_format_group = {
-       .name = "format",
-       .attrs = hswep_uncore_sbox_formats_attr,
-};
-
-static struct intel_uncore_type hswep_uncore_sbox = {
-       .name                   = "sbox",
-       .num_counters           = 4,
-       .num_boxes              = 4,
-       .perf_ctr_bits          = 44,
-       .event_ctl              = HSWEP_S0_MSR_PMON_CTL0,
-       .perf_ctr               = HSWEP_S0_MSR_PMON_CTR0,
-       .event_mask             = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_S0_MSR_PMON_BOX_CTL,
-       .msr_offset             = HSWEP_SBOX_MSR_OFFSET,
-       .ops                    = &hswep_uncore_sbox_msr_ops,
-       .format_group           = &hswep_uncore_sbox_format_group,
-};
-
-static int hswep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
-
-       if (ev_sel >= 0xb && ev_sel <= 0xe) {
-               reg1->reg = HSWEP_PCU_MSR_PMON_BOX_FILTER;
-               reg1->idx = ev_sel - 0xb;
-               reg1->config = event->attr.config1 & (0xff << reg1->idx);
-       }
-       return 0;
-}
-
-static struct intel_uncore_ops hswep_uncore_pcu_ops = {
-       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .hw_config              = hswep_pcu_hw_config,
-       .get_constraint         = snbep_pcu_get_constraint,
-       .put_constraint         = snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type hswep_uncore_pcu = {
-       .name                   = "pcu",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = HSWEP_PCU_MSR_PMON_CTR0,
-       .event_ctl              = HSWEP_PCU_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_PCU_MSR_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &hswep_uncore_pcu_ops,
-       .format_group           = &snbep_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *hswep_msr_uncores[] = {
-       &hswep_uncore_ubox,
-       &hswep_uncore_cbox,
-       &hswep_uncore_sbox,
-       &hswep_uncore_pcu,
-       NULL,
-};
-
-void hswep_uncore_cpu_init(void)
-{
-       if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-
-       /* Detect 6-8 core systems with only two SBOXes */
-       if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) {
-               u32 capid4;
-
-               pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3],
-                                     0x94, &capid4);
-               if (((capid4 >> 6) & 0x3) == 0)
-                       hswep_uncore_sbox.num_boxes = 2;
-       }
-
-       uncore_msr_uncores = hswep_msr_uncores;
-}
-
-static struct intel_uncore_type hswep_uncore_ha = {
-       .name           = "ha",
-       .num_counters   = 5,
-       .num_boxes      = 2,
-       .perf_ctr_bits  = 48,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct uncore_event_desc hswep_uncore_imc_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks,      "event=0x00,umask=0x00"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read,  "event=0x04,umask=0x03"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
-       { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_type hswep_uncore_imc = {
-       .name           = "imc",
-       .num_counters   = 5,
-       .num_boxes      = 8,
-       .perf_ctr_bits  = 48,
-       .fixed_ctr_bits = 48,
-       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
-       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
-       .event_descs    = hswep_uncore_imc_events,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8};
-
-static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-       u64 count = 0;
-
-       pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
-       pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
-
-       return count;
-}
-
-static struct intel_uncore_ops hswep_uncore_irp_ops = {
-       .init_box       = snbep_uncore_pci_init_box,
-       .disable_box    = snbep_uncore_pci_disable_box,
-       .enable_box     = snbep_uncore_pci_enable_box,
-       .disable_event  = ivbep_uncore_irp_disable_event,
-       .enable_event   = ivbep_uncore_irp_enable_event,
-       .read_counter   = hswep_uncore_irp_read_counter,
-};
-
-static struct intel_uncore_type hswep_uncore_irp = {
-       .name                   = "irp",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .ops                    = &hswep_uncore_irp_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type hswep_uncore_qpi = {
-       .name                   = "qpi",
-       .num_counters           = 5,
-       .num_boxes              = 3,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
-       .event_ctl              = SNBEP_PCI_PMON_CTL0,
-       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &snbep_uncore_qpi_ops,
-       .format_group           = &snbep_uncore_qpi_format_group,
-};
-
-static struct event_constraint hswep_uncore_r2pcie_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x24, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x27, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2a, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type hswep_uncore_r2pcie = {
-       .name           = "r2pcie",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       .constraints    = hswep_uncore_r2pcie_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct event_constraint hswep_uncore_r3qpi_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x01, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type hswep_uncore_r3qpi = {
-       .name           = "r3qpi",
-       .num_counters   = 4,
-       .num_boxes      = 3,
-       .perf_ctr_bits  = 44,
-       .constraints    = hswep_uncore_r3qpi_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
-       HSWEP_PCI_UNCORE_HA,
-       HSWEP_PCI_UNCORE_IMC,
-       HSWEP_PCI_UNCORE_IRP,
-       HSWEP_PCI_UNCORE_QPI,
-       HSWEP_PCI_UNCORE_R2PCIE,
-       HSWEP_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *hswep_pci_uncores[] = {
-       [HSWEP_PCI_UNCORE_HA]   = &hswep_uncore_ha,
-       [HSWEP_PCI_UNCORE_IMC]  = &hswep_uncore_imc,
-       [HSWEP_PCI_UNCORE_IRP]  = &hswep_uncore_irp,
-       [HSWEP_PCI_UNCORE_QPI]  = &hswep_uncore_qpi,
-       [HSWEP_PCI_UNCORE_R2PCIE]       = &hswep_uncore_r2pcie,
-       [HSWEP_PCI_UNCORE_R3QPI]        = &hswep_uncore_r3qpi,
-       NULL,
-};
-
-static const struct pci_device_id hswep_uncore_pci_ids[] = {
-       { /* Home Agent 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0),
-       },
-       { /* Home Agent 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f38),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 1),
-       },
-       { /* MC0 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb0),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 0),
-       },
-       { /* MC0 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb1),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 1),
-       },
-       { /* MC0 Channel 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb4),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 2),
-       },
-       { /* MC0 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb5),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 3),
-       },
-       { /* MC1 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd0),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 4),
-       },
-       { /* MC1 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd1),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 5),
-       },
-       { /* MC1 Channel 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd4),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 6),
-       },
-       { /* MC1 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd5),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 7),
-       },
-       { /* IRP */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f39),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IRP, 0),
-       },
-       { /* QPI0 Port 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f32),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 0),
-       },
-       { /* QPI0 Port 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f33),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 1),
-       },
-       { /* QPI1 Port 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3a),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 2),
-       },
-       { /* R2PCIe */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f34),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R2PCIE, 0),
-       },
-       { /* R3QPI0 Link 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f36),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 0),
-       },
-       { /* R3QPI0 Link 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f37),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 1),
-       },
-       { /* R3QPI1 Link 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3e),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 2),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f86),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT0_FILTER),
-       },
-       { /* QPI Port 1 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f96),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT1_FILTER),
-       },
-       { /* PCU.3 (for Capability registers) */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  HSWEP_PCI_PCU_3),
-       },
-       { /* end: all zeroes */ }
-};
-
-static struct pci_driver hswep_uncore_pci_driver = {
-       .name           = "hswep_uncore",
-       .id_table       = hswep_uncore_pci_ids,
-};
-
-int hswep_uncore_pci_init(void)
-{
-       int ret = snbep_pci2phy_map_init(0x2f1e);
-       if (ret)
-               return ret;
-       uncore_pci_uncores = hswep_pci_uncores;
-       uncore_pci_driver = &hswep_uncore_pci_driver;
-       return 0;
-}
-/* end of Haswell-EP uncore support */
-
-/* BDX uncore support */
-
-static struct intel_uncore_type bdx_uncore_ubox = {
-       .name                   = "ubox",
-       .num_counters           = 2,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
-       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
-       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &ivbep_uncore_msr_ops,
-       .format_group           = &ivbep_uncore_ubox_format_group,
-};
-
-static struct event_constraint bdx_uncore_cbox_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type bdx_uncore_cbox = {
-       .name                   = "cbox",
-       .num_counters           = 4,
-       .num_boxes              = 24,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
-       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
-       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
-       .msr_offset             = HSWEP_CBO_MSR_OFFSET,
-       .num_shared_regs        = 1,
-       .constraints            = bdx_uncore_cbox_constraints,
-       .ops                    = &hswep_uncore_cbox_ops,
-       .format_group           = &hswep_uncore_cbox_format_group,
-};
-
-static struct intel_uncore_type bdx_uncore_sbox = {
-       .name                   = "sbox",
-       .num_counters           = 4,
-       .num_boxes              = 4,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = HSWEP_S0_MSR_PMON_CTL0,
-       .perf_ctr               = HSWEP_S0_MSR_PMON_CTR0,
-       .event_mask             = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_S0_MSR_PMON_BOX_CTL,
-       .msr_offset             = HSWEP_SBOX_MSR_OFFSET,
-       .ops                    = &hswep_uncore_sbox_msr_ops,
-       .format_group           = &hswep_uncore_sbox_format_group,
-};
-
-static struct intel_uncore_type *bdx_msr_uncores[] = {
-       &bdx_uncore_ubox,
-       &bdx_uncore_cbox,
-       &bdx_uncore_sbox,
-       &hswep_uncore_pcu,
-       NULL,
-};
-
-void bdx_uncore_cpu_init(void)
-{
-       if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-       uncore_msr_uncores = bdx_msr_uncores;
-}
-
-static struct intel_uncore_type bdx_uncore_ha = {
-       .name           = "ha",
-       .num_counters   = 4,
-       .num_boxes      = 2,
-       .perf_ctr_bits  = 48,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type bdx_uncore_imc = {
-       .name           = "imc",
-       .num_counters   = 5,
-       .num_boxes      = 8,
-       .perf_ctr_bits  = 48,
-       .fixed_ctr_bits = 48,
-       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
-       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
-       .event_descs    = hswep_uncore_imc_events,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type bdx_uncore_irp = {
-       .name                   = "irp",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .ops                    = &hswep_uncore_irp_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type bdx_uncore_qpi = {
-       .name                   = "qpi",
-       .num_counters           = 4,
-       .num_boxes              = 3,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
-       .event_ctl              = SNBEP_PCI_PMON_CTL0,
-       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &snbep_uncore_qpi_ops,
-       .format_group           = &snbep_uncore_qpi_format_group,
-};
-
-static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type bdx_uncore_r2pcie = {
-       .name           = "r2pcie",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       .constraints    = bdx_uncore_r2pcie_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct event_constraint bdx_uncore_r3qpi_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x01, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type bdx_uncore_r3qpi = {
-       .name           = "r3qpi",
-       .num_counters   = 3,
-       .num_boxes      = 3,
-       .perf_ctr_bits  = 48,
-       .constraints    = bdx_uncore_r3qpi_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
-       BDX_PCI_UNCORE_HA,
-       BDX_PCI_UNCORE_IMC,
-       BDX_PCI_UNCORE_IRP,
-       BDX_PCI_UNCORE_QPI,
-       BDX_PCI_UNCORE_R2PCIE,
-       BDX_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *bdx_pci_uncores[] = {
-       [BDX_PCI_UNCORE_HA]     = &bdx_uncore_ha,
-       [BDX_PCI_UNCORE_IMC]    = &bdx_uncore_imc,
-       [BDX_PCI_UNCORE_IRP]    = &bdx_uncore_irp,
-       [BDX_PCI_UNCORE_QPI]    = &bdx_uncore_qpi,
-       [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie,
-       [BDX_PCI_UNCORE_R3QPI]  = &bdx_uncore_r3qpi,
-       NULL,
-};
-
-static const struct pci_device_id bdx_uncore_pci_ids[] = {
-       { /* Home Agent 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0),
-       },
-       { /* Home Agent 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1),
-       },
-       { /* MC0 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0),
-       },
-       { /* MC0 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1),
-       },
-       { /* MC0 Channel 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2),
-       },
-       { /* MC0 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3),
-       },
-       { /* MC1 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4),
-       },
-       { /* MC1 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5),
-       },
-       { /* MC1 Channel 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6),
-       },
-       { /* MC1 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7),
-       },
-       { /* IRP */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
-       },
-       { /* QPI0 Port 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0),
-       },
-       { /* QPI0 Port 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1),
-       },
-       { /* QPI1 Port 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2),
-       },
-       { /* R2PCIe */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
-       },
-       { /* R3QPI0 Link 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0),
-       },
-       { /* R3QPI0 Link 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1),
-       },
-       { /* R3QPI1 Link 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0),
-       },
-       { /* QPI Port 1 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1),
-       },
-       { /* QPI Port 2 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2),
-       },
-       { /* end: all zeroes */ }
-};
-
-static struct pci_driver bdx_uncore_pci_driver = {
-       .name           = "bdx_uncore",
-       .id_table       = bdx_uncore_pci_ids,
-};
-
-int bdx_uncore_pci_init(void)
-{
-       int ret = snbep_pci2phy_map_init(0x6f1e);
-
-       if (ret)
-               return ret;
-       uncore_pci_uncores = bdx_pci_uncores;
-       uncore_pci_driver = &bdx_uncore_pci_driver;
-       return 0;
-}
-
-/* end of BDX uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c

deleted file mode 100644 (file)

index 5b0c232..0000000
--- a/arch/x86/kernel/cpu/perf_event_knc.c
+++ /dev/null
@@ -1,319 +0,0 @@
-/* Driver for Intel Xeon Phi "Knights Corner" PMU */
-
-#include <linux/perf_event.h>
-#include <linux/types.h>
-
-#include <asm/hardirq.h>
-
-#include "perf_event.h"
-
-static const u64 knc_perfmon_event_map[] =
-{
-  [PERF_COUNT_HW_CPU_CYCLES]           = 0x002a,
-  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x0016,
-  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0028,
-  [PERF_COUNT_HW_CACHE_MISSES]         = 0x0029,
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x0012,
-  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x002b,
-};
-
-static const u64 __initconst knc_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               /* On Xeon Phi event "0" is a valid DATA_READ          */
-               /*   (L1 Data Cache Reads) Instruction.                */
-               /* We code this as ARCH_PERFMON_EVENTSEL_INT as this   */
-               /* bit will always be set in x86_pmu_hw_config().      */
-               [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
-                                               /* DATA_READ           */
-               [ C(RESULT_MISS)   ] = 0x0003,  /* DATA_READ_MISS      */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0001,  /* DATA_WRITE          */
-               [ C(RESULT_MISS)   ] = 0x0004,  /* DATA_WRITE_MISS     */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0011,  /* L1_DATA_PF1         */
-               [ C(RESULT_MISS)   ] = 0x001c,  /* L1_DATA_PF1_MISS    */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x000c,  /* CODE_READ          */
-               [ C(RESULT_MISS)   ] = 0x000e,  /* CODE_CACHE_MISS    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0x10cb,  /* L2_READ_MISS */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x10cc,  /* L2_WRITE_HIT */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x10fc,  /* L2_DATA_PF2      */
-               [ C(RESULT_MISS)   ] = 0x10fe,  /* L2_DATA_PF2_MISS */
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
-                                               /* DATA_READ */
-                                               /* see note on L1 OP_READ */
-               [ C(RESULT_MISS)   ] = 0x0002,  /* DATA_PAGE_WALK */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0001,  /* DATA_WRITE */
-               [ C(RESULT_MISS)   ] = 0x0002,  /* DATA_PAGE_WALK */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x000c,  /* CODE_READ */
-               [ C(RESULT_MISS)   ] = 0x000d,  /* CODE_PAGE_WALK */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0012,  /* BRANCHES */
-               [ C(RESULT_MISS)   ] = 0x002b,  /* BRANCHES_MISPREDICTED */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-
-static u64 knc_pmu_event_map(int hw_event)
-{
-       return knc_perfmon_event_map[hw_event];
-}
-
-static struct event_constraint knc_event_constraints[] =
-{
-       INTEL_EVENT_CONSTRAINT(0xc3, 0x1),      /* HWP_L2HIT */
-       INTEL_EVENT_CONSTRAINT(0xc4, 0x1),      /* HWP_L2MISS */
-       INTEL_EVENT_CONSTRAINT(0xc8, 0x1),      /* L2_READ_HIT_E */
-       INTEL_EVENT_CONSTRAINT(0xc9, 0x1),      /* L2_READ_HIT_M */
-       INTEL_EVENT_CONSTRAINT(0xca, 0x1),      /* L2_READ_HIT_S */
-       INTEL_EVENT_CONSTRAINT(0xcb, 0x1),      /* L2_READ_MISS */
-       INTEL_EVENT_CONSTRAINT(0xcc, 0x1),      /* L2_WRITE_HIT */
-       INTEL_EVENT_CONSTRAINT(0xce, 0x1),      /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */
-       INTEL_EVENT_CONSTRAINT(0xcf, 0x1),      /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */
-       INTEL_EVENT_CONSTRAINT(0xd7, 0x1),      /* L2_VICTIM_REQ_WITH_DATA */
-       INTEL_EVENT_CONSTRAINT(0xe3, 0x1),      /* SNP_HITM_BUNIT */
-       INTEL_EVENT_CONSTRAINT(0xe6, 0x1),      /* SNP_HIT_L2 */
-       INTEL_EVENT_CONSTRAINT(0xe7, 0x1),      /* SNP_HITM_L2 */
-       INTEL_EVENT_CONSTRAINT(0xf1, 0x1),      /* L2_DATA_READ_MISS_CACHE_FILL */
-       INTEL_EVENT_CONSTRAINT(0xf2, 0x1),      /* L2_DATA_WRITE_MISS_CACHE_FILL */
-       INTEL_EVENT_CONSTRAINT(0xf6, 0x1),      /* L2_DATA_READ_MISS_MEM_FILL */
-       INTEL_EVENT_CONSTRAINT(0xf7, 0x1),      /* L2_DATA_WRITE_MISS_MEM_FILL */
-       INTEL_EVENT_CONSTRAINT(0xfc, 0x1),      /* L2_DATA_PF2 */
-       INTEL_EVENT_CONSTRAINT(0xfd, 0x1),      /* L2_DATA_PF2_DROP */
-       INTEL_EVENT_CONSTRAINT(0xfe, 0x1),      /* L2_DATA_PF2_MISS */
-       INTEL_EVENT_CONSTRAINT(0xff, 0x1),      /* L2_DATA_HIT_INFLIGHT_PF2 */
-       EVENT_CONSTRAINT_END
-};
-
-#define MSR_KNC_IA32_PERF_GLOBAL_STATUS                0x0000002d
-#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL   0x0000002e
-#define MSR_KNC_IA32_PERF_GLOBAL_CTRL          0x0000002f
-
-#define KNC_ENABLE_COUNTER0                    0x00000001
-#define KNC_ENABLE_COUNTER1                    0x00000002
-
-static void knc_pmu_disable_all(void)
-{
-       u64 val;
-
-       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
-       val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
-       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
-}
-
-static void knc_pmu_enable_all(int added)
-{
-       u64 val;
-
-       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
-       val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
-       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
-}
-
-static inline void
-knc_pmu_disable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 val;
-
-       val = hwc->config;
-       val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-
-       (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
-}
-
-static void knc_pmu_enable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 val;
-
-       val = hwc->config;
-       val |= ARCH_PERFMON_EVENTSEL_ENABLE;
-
-       (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
-}
-
-static inline u64 knc_pmu_get_status(void)
-{
-       u64 status;
-
-       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status);
-
-       return status;
-}
-
-static inline void knc_pmu_ack_status(u64 ack)
-{
-       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack);
-}
-
-static int knc_pmu_handle_irq(struct pt_regs *regs)
-{
-       struct perf_sample_data data;
-       struct cpu_hw_events *cpuc;
-       int handled = 0;
-       int bit, loops;
-       u64 status;
-
-       cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       knc_pmu_disable_all();
-
-       status = knc_pmu_get_status();
-       if (!status) {
-               knc_pmu_enable_all(0);
-               return handled;
-       }
-
-       loops = 0;
-again:
-       knc_pmu_ack_status(status);
-       if (++loops > 100) {
-               WARN_ONCE(1, "perf: irq loop stuck!\n");
-               perf_event_print_debug();
-               goto done;
-       }
-
-       inc_irq_stat(apic_perf_irqs);
-
-       for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
-               struct perf_event *event = cpuc->events[bit];
-
-               handled++;
-
-               if (!test_bit(bit, cpuc->active_mask))
-                       continue;
-
-               if (!intel_pmu_save_and_restart(event))
-                       continue;
-
-               perf_sample_data_init(&data, 0, event->hw.last_period);
-
-               if (perf_event_overflow(event, &data, regs))
-                       x86_pmu_stop(event, 0);
-       }
-
-       /*
-        * Repeat if there is more work to be done:
-        */
-       status = knc_pmu_get_status();
-       if (status)
-               goto again;
-
-done:
-       knc_pmu_enable_all(0);
-
-       return handled;
-}
-
-
-PMU_FORMAT_ATTR(event, "config:0-7"    );
-PMU_FORMAT_ATTR(umask, "config:8-15"   );
-PMU_FORMAT_ATTR(edge,  "config:18"     );
-PMU_FORMAT_ATTR(inv,   "config:23"     );
-PMU_FORMAT_ATTR(cmask, "config:24-31"  );
-
-static struct attribute *intel_knc_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask.attr,
-       NULL,
-};
-
-static const struct x86_pmu knc_pmu __initconst = {
-       .name                   = "knc",
-       .handle_irq             = knc_pmu_handle_irq,
-       .disable_all            = knc_pmu_disable_all,
-       .enable_all             = knc_pmu_enable_all,
-       .enable                 = knc_pmu_enable_event,
-       .disable                = knc_pmu_disable_event,
-       .hw_config              = x86_pmu_hw_config,
-       .schedule_events        = x86_schedule_events,
-       .eventsel               = MSR_KNC_EVNTSEL0,
-       .perfctr                = MSR_KNC_PERFCTR0,
-       .event_map              = knc_pmu_event_map,
-       .max_events             = ARRAY_SIZE(knc_perfmon_event_map),
-       .apic                   = 1,
-       .max_period             = (1ULL << 39) - 1,
-       .version                = 0,
-       .num_counters           = 2,
-       .cntval_bits            = 40,
-       .cntval_mask            = (1ULL << 40) - 1,
-       .get_event_constraints  = x86_get_event_constraints,
-       .event_constraints      = knc_event_constraints,
-       .format_attrs           = intel_knc_formats_attr,
-};
-
-__init int knc_pmu_init(void)
-{
-       x86_pmu = knc_pmu;
-
-       memcpy(hw_cache_event_ids, knc_hw_cache_event_ids, 
-               sizeof(hw_cache_event_ids));
-
-       return 0;
-}
diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c

deleted file mode 100644 (file)

index ec863b9..0000000
--- a/arch/x86/kernel/cpu/perf_event_msr.c
+++ /dev/null
@@ -1,241 +0,0 @@
-#include <linux/perf_event.h>
-
-enum perf_msr_id {
-       PERF_MSR_TSC                    = 0,
-       PERF_MSR_APERF                  = 1,
-       PERF_MSR_MPERF                  = 2,
-       PERF_MSR_PPERF                  = 3,
-       PERF_MSR_SMI                    = 4,
-
-       PERF_MSR_EVENT_MAX,
-};
-
-static bool test_aperfmperf(int idx)
-{
-       return boot_cpu_has(X86_FEATURE_APERFMPERF);
-}
-
-static bool test_intel(int idx)
-{
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-           boot_cpu_data.x86 != 6)
-               return false;
-
-       switch (boot_cpu_data.x86_model) {
-       case 30: /* 45nm Nehalem    */
-       case 26: /* 45nm Nehalem-EP */
-       case 46: /* 45nm Nehalem-EX */
-
-       case 37: /* 32nm Westmere    */
-       case 44: /* 32nm Westmere-EP */
-       case 47: /* 32nm Westmere-EX */
-
-       case 42: /* 32nm SandyBridge         */
-       case 45: /* 32nm SandyBridge-E/EN/EP */
-
-       case 58: /* 22nm IvyBridge       */
-       case 62: /* 22nm IvyBridge-EP/EX */
-
-       case 60: /* 22nm Haswell Core */
-       case 63: /* 22nm Haswell Server */
-       case 69: /* 22nm Haswell ULT */
-       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-
-       case 61: /* 14nm Broadwell Core-M */
-       case 86: /* 14nm Broadwell Xeon D */
-       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
-       case 79: /* 14nm Broadwell Server */
-
-       case 55: /* 22nm Atom "Silvermont"                */
-       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
-       case 76: /* 14nm Atom "Airmont"                   */
-               if (idx == PERF_MSR_SMI)
-                       return true;
-               break;
-
-       case 78: /* 14nm Skylake Mobile */
-       case 94: /* 14nm Skylake Desktop */
-               if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
-                       return true;
-               break;
-       }
-
-       return false;
-}
-
-struct perf_msr {
-       u64     msr;
-       struct  perf_pmu_events_attr *attr;
-       bool    (*test)(int idx);
-};
-
-PMU_EVENT_ATTR_STRING(tsc,   evattr_tsc,   "event=0x00");
-PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
-PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
-PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
-PMU_EVENT_ATTR_STRING(smi,   evattr_smi,   "event=0x04");
-
-static struct perf_msr msr[] = {
-       [PERF_MSR_TSC]   = { 0,                 &evattr_tsc,    NULL,            },
-       [PERF_MSR_APERF] = { MSR_IA32_APERF,    &evattr_aperf,  test_aperfmperf, },
-       [PERF_MSR_MPERF] = { MSR_IA32_MPERF,    &evattr_mperf,  test_aperfmperf, },
-       [PERF_MSR_PPERF] = { MSR_PPERF,         &evattr_pperf,  test_intel,      },
-       [PERF_MSR_SMI]   = { MSR_SMI_COUNT,     &evattr_smi,    test_intel,      },
-};
-
-static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
-       NULL,
-};
-
-static struct attribute_group events_attr_group = {
-       .name = "events",
-       .attrs = events_attrs,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-63");
-static struct attribute *format_attrs[] = {
-       &format_attr_event.attr,
-       NULL,
-};
-static struct attribute_group format_attr_group = {
-       .name = "format",
-       .attrs = format_attrs,
-};
-
-static const struct attribute_group *attr_groups[] = {
-       &events_attr_group,
-       &format_attr_group,
-       NULL,
-};
-
-static int msr_event_init(struct perf_event *event)
-{
-       u64 cfg = event->attr.config;
-
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       if (cfg >= PERF_MSR_EVENT_MAX)
-               return -EINVAL;
-
-       /* unsupported modes and filters */
-       if (event->attr.exclude_user   ||
-           event->attr.exclude_kernel ||
-           event->attr.exclude_hv     ||
-           event->attr.exclude_idle   ||
-           event->attr.exclude_host   ||
-           event->attr.exclude_guest  ||
-           event->attr.sample_period) /* no sampling */
-               return -EINVAL;
-
-       if (!msr[cfg].attr)
-               return -EINVAL;
-
-       event->hw.idx = -1;
-       event->hw.event_base = msr[cfg].msr;
-       event->hw.config = cfg;
-
-       return 0;
-}
-
-static inline u64 msr_read_counter(struct perf_event *event)
-{
-       u64 now;
-
-       if (event->hw.event_base)
-               rdmsrl(event->hw.event_base, now);
-       else
-               rdtscll(now);
-
-       return now;
-}
-static void msr_event_update(struct perf_event *event)
-{
-       u64 prev, now;
-       s64 delta;
-
-       /* Careful, an NMI might modify the previous event value. */
-again:
-       prev = local64_read(&event->hw.prev_count);
-       now = msr_read_counter(event);
-
-       if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev)
-               goto again;
-
-       delta = now - prev;
-       if (unlikely(event->hw.event_base == MSR_SMI_COUNT))
-               delta = sign_extend64(delta, 31);
-
-       local64_add(now - prev, &event->count);
-}
-
-static void msr_event_start(struct perf_event *event, int flags)
-{
-       u64 now;
-
-       now = msr_read_counter(event);
-       local64_set(&event->hw.prev_count, now);
-}
-
-static void msr_event_stop(struct perf_event *event, int flags)
-{
-       msr_event_update(event);
-}
-
-static void msr_event_del(struct perf_event *event, int flags)
-{
-       msr_event_stop(event, PERF_EF_UPDATE);
-}
-
-static int msr_event_add(struct perf_event *event, int flags)
-{
-       if (flags & PERF_EF_START)
-               msr_event_start(event, flags);
-
-       return 0;
-}
-
-static struct pmu pmu_msr = {
-       .task_ctx_nr    = perf_sw_context,
-       .attr_groups    = attr_groups,
-       .event_init     = msr_event_init,
-       .add            = msr_event_add,
-       .del            = msr_event_del,
-       .start          = msr_event_start,
-       .stop           = msr_event_stop,
-       .read           = msr_event_update,
-       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
-};
-
-static int __init msr_init(void)
-{
-       int i, j = 0;
-
-       if (!boot_cpu_has(X86_FEATURE_TSC)) {
-               pr_cont("no MSR PMU driver.\n");
-               return 0;
-       }
-
-       /* Probe the MSRs. */
-       for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
-               u64 val;
-
-               /*
-                * Virt sucks arse; you cannot tell if a R/O MSR is present :/
-                */
-               if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
-                       msr[i].attr = NULL;
-       }
-
-       /* List remaining MSRs in the sysfs attrs. */
-       for (i = 0; i < PERF_MSR_EVENT_MAX; i++) {
-               if (msr[i].attr)
-                       events_attrs[j++] = &msr[i].attr->attr.attr;
-       }
-       events_attrs[j] = NULL;
-
-       perf_pmu_register(&pmu_msr, "msr", -1);
-
-       return 0;
-}
-device_initcall(msr_init);
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c

deleted file mode 100644 (file)

index f2e5678..0000000
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ /dev/null
@@ -1,1376 +0,0 @@
-/*
- * Netburst Performance Events (P4, old Xeon)
- *
- *  Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
- *  Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
- *
- *  For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-
-#include <asm/perf_event_p4.h>
-#include <asm/hardirq.h>
-#include <asm/apic.h>
-
-#include "perf_event.h"
-
-#define P4_CNTR_LIMIT 3
-/*
- * array indices: 0,1 - HT threads, used with HT enabled cpu
- */
-struct p4_event_bind {
-       unsigned int opcode;                    /* Event code and ESCR selector */
-       unsigned int escr_msr[2];               /* ESCR MSR for this event */
-       unsigned int escr_emask;                /* valid ESCR EventMask bits */
-       unsigned int shared;                    /* event is shared across threads */
-       char cntr[2][P4_CNTR_LIMIT];            /* counter index (offset), -1 on abscence */
-};
-
-struct p4_pebs_bind {
-       unsigned int metric_pebs;
-       unsigned int metric_vert;
-};
-
-/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
-#define P4_GEN_PEBS_BIND(name, pebs, vert)                     \
-       [P4_PEBS_METRIC__##name] = {                            \
-               .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG,   \
-               .metric_vert = vert,                            \
-       }
-
-/*
- * note we have P4_PEBS_ENABLE_UOP_TAG always set here
- *
- * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
- * event configuration to find out which values are to be
- * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
- * resgisters
- */
-static struct p4_pebs_bind p4_pebs_bind_map[] = {
-       P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired,  0x0000001, 0x0000001),
-       P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired,  0x0000002, 0x0000001),
-       P4_GEN_PEBS_BIND(dtlb_load_miss_retired,        0x0000004, 0x0000001),
-       P4_GEN_PEBS_BIND(dtlb_store_miss_retired,       0x0000004, 0x0000002),
-       P4_GEN_PEBS_BIND(dtlb_all_miss_retired,         0x0000004, 0x0000003),
-       P4_GEN_PEBS_BIND(tagged_mispred_branch,         0x0018000, 0x0000010),
-       P4_GEN_PEBS_BIND(mob_load_replay_retired,       0x0000200, 0x0000001),
-       P4_GEN_PEBS_BIND(split_load_retired,            0x0000400, 0x0000001),
-       P4_GEN_PEBS_BIND(split_store_retired,           0x0000400, 0x0000002),
-};
-
-/*
- * Note that we don't use CCCR1 here, there is an
- * exception for P4_BSQ_ALLOCATION but we just have
- * no workaround
- *
- * consider this binding as resources which particular
- * event may borrow, it doesn't contain EventMask,
- * Tags and friends -- they are left to a caller
- */
-static struct p4_event_bind p4_event_bind_map[] = {
-       [P4_EVENT_TC_DELIVER_MODE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
-               .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
-               .shared         = 1,
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_BPU_FETCH_REQUEST] = {
-               .opcode         = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
-               .escr_msr       = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_ITLB_REFERENCE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
-               .escr_msr       = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_MEMORY_CANCEL] = {
-               .opcode         = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
-               .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_MEMORY_COMPLETE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
-               .escr_msr       = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_LOAD_PORT_REPLAY] = {
-               .opcode         = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
-               .escr_msr       = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_STORE_PORT_REPLAY] = {
-               .opcode         = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
-               .escr_msr       = { MSR_P4_SAAT_ESCR0 ,  MSR_P4_SAAT_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_MOB_LOAD_REPLAY] = {
-               .opcode         = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
-               .escr_msr       = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_PAGE_WALK_TYPE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
-               .escr_msr       = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
-               .shared         = 1,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_BSQ_CACHE_REFERENCE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
-               .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_IOQ_ALLOCATION] = {
-               .opcode         = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER)               |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_IOQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
-               .opcode         = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
-               .escr_msr       = { MSR_P4_FSB_ESCR1,  MSR_P4_FSB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ)        |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC)          |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC)          |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT)          |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP)          |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB)          |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
-               .cntr           = { {2, -1, -1}, {3, -1, -1} },
-       },
-       [P4_EVENT_FSB_DATA_ACTIVITY] = {
-               .opcode         = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
-               .shared         = 1,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_BSQ_ALLOCATION] = {           /* shared ESCR, broken CCCR1 */
-               .opcode         = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
-               .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE)      |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE)      |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE)        |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE)        |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
-               .cntr           = { {0, -1, -1}, {1, -1, -1} },
-       },
-       [P4_EVENT_BSQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
-               .opcode         = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
-               .escr_msr       = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0)        |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1)        |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE)     |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE)  |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE)  |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE)    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE)    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
-               .cntr           = { {2, -1, -1}, {3, -1, -1} },
-       },
-       [P4_EVENT_SSE_INPUT_ASSIST] = {
-               .opcode         = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_PACKED_SP_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_PACKED_DP_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_SCALAR_SP_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_SCALAR_DP_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_64BIT_MMX_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_128BIT_MMX_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_X87_FP_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_X87_FP_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_TC_MISC] = {
-               .opcode         = P4_OPCODE(P4_EVENT_TC_MISC),
-               .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_GLOBAL_POWER_EVENTS] = {
-               .opcode         = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_TC_MS_XFER] = {
-               .opcode         = P4_OPCODE(P4_EVENT_TC_MS_XFER),
-               .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_UOP_QUEUE_WRITES] = {
-               .opcode         = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
-               .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD)     |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
-               .escr_msr       = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL)    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_RETIRED_BRANCH_TYPE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
-               .escr_msr       = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_RESOURCE_STALL] = {
-               .opcode         = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
-               .escr_msr       = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_WC_BUFFER] = {
-               .opcode         = P4_OPCODE(P4_EVENT_WC_BUFFER),
-               .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS)               |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_B2B_CYCLES] = {
-               .opcode         = P4_OPCODE(P4_EVENT_B2B_CYCLES),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     = 0,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_BNR] = {
-               .opcode         = P4_OPCODE(P4_EVENT_BNR),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     = 0,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_SNOOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_SNOOP),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     = 0,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_RESPONSE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_RESPONSE),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     = 0,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_FRONT_END_EVENT] = {
-               .opcode         = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_EXECUTION_EVENT] = {
-               .opcode         = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_REPLAY_EVENT] = {
-               .opcode         = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_INSTR_RETIRED] = {
-               .opcode         = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
-               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_UOPS_RETIRED] = {
-               .opcode         = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
-               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_UOP_TYPE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_UOP_TYPE),
-               .escr_msr       = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS)                  |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_BRANCH_RETIRED] = {
-               .opcode         = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
-               .opcode         = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
-               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_X87_ASSIST] = {
-               .opcode         = P4_OPCODE(P4_EVENT_X87_ASSIST),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU)                    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO)                    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO)                    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU)                    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_MACHINE_CLEAR] = {
-               .opcode         = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_INSTR_COMPLETED] = {
-               .opcode         = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
-               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-};
-
-#define P4_GEN_CACHE_EVENT(event, bit, metric)                           \
-       p4_config_pack_escr(P4_ESCR_EVENT(event)                        | \
-                           P4_ESCR_EMASK_BIT(event, bit))              | \
-       p4_config_pack_cccr(metric                                      | \
-                           P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
-
-static __initconst const u64 p4_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
-                                               P4_PEBS_METRIC__1stl_cache_load_miss_retired),
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
-                                               P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
-       },
-},
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
-                                               P4_PEBS_METRIC__dtlb_load_miss_retired),
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
-                                               P4_PEBS_METRIC__dtlb_store_miss_retired),
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
-                                               P4_PEBS_METRIC__none),
-               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
-                                               P4_PEBS_METRIC__none),
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-/*
- * Because of Netburst being quite restricted in how many
- * identical events may run simultaneously, we introduce event aliases,
- * ie the different events which have the same functionality but
- * utilize non-intersected resources (ESCR/CCCR/counter registers).
- *
- * This allow us to relax restrictions a bit and run two or more
- * identical events together.
- *
- * Never set any custom internal bits such as P4_CONFIG_HT,
- * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
- * either up to date automatically or not applicable at all.
- */
-struct p4_event_alias {
-       u64 original;
-       u64 alternative;
-} p4_event_aliases[] = {
-       {
-               /*
-                * Non-halted cycles can be substituted with non-sleeping cycles (see
-                * Intel SDM Vol3b for details). We need this alias to be able
-                * to run nmi-watchdog and 'perf top' (or any other user space tool
-                * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
-                * simultaneously.
-                */
-       .original       =
-               p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)         |
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
-       .alternative    =
-               p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT)             |
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
-               p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT          |
-                                   P4_CCCR_COMPARE),
-       },
-};
-
-static u64 p4_get_alias_event(u64 config)
-{
-       u64 config_match;
-       int i;
-
-       /*
-        * Only event with special mark is allowed,
-        * we're to be sure it didn't come as malformed
-        * RAW event.
-        */
-       if (!(config & P4_CONFIG_ALIASABLE))
-               return 0;
-
-       config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
-
-       for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
-               if (config_match == p4_event_aliases[i].original) {
-                       config_match = p4_event_aliases[i].alternative;
-                       break;
-               } else if (config_match == p4_event_aliases[i].alternative) {
-                       config_match = p4_event_aliases[i].original;
-                       break;
-               }
-       }
-
-       if (i >= ARRAY_SIZE(p4_event_aliases))
-               return 0;
-
-       return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
-}
-
-static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
-  /* non-halted CPU clocks */
-  [PERF_COUNT_HW_CPU_CYCLES] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING))       |
-               P4_CONFIG_ALIASABLE,
-
-  /*
-   * retired instructions
-   * in a sake of simplicity we don't use the FSB tagging
-   */
-  [PERF_COUNT_HW_INSTRUCTIONS] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED)               |
-               P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)           |
-               P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
-
-  /* cache hits */
-  [PERF_COUNT_HW_CACHE_REFERENCES] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
-
-  /* cache misses */
-  [PERF_COUNT_HW_CACHE_MISSES] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
-
-  /* branch instructions retired */
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)    |
-               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)           |
-               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
-
-  /* mispredicted branches retired */
-  [PERF_COUNT_HW_BRANCH_MISSES]        =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED)      |
-               P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
-
-  /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN):  */
-  [PERF_COUNT_HW_BUS_CYCLES] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY)           |
-               P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN))        |
-       p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
-};
-
-static struct p4_event_bind *p4_config_get_bind(u64 config)
-{
-       unsigned int evnt = p4_config_unpack_event(config);
-       struct p4_event_bind *bind = NULL;
-
-       if (evnt < ARRAY_SIZE(p4_event_bind_map))
-               bind = &p4_event_bind_map[evnt];
-
-       return bind;
-}
-
-static u64 p4_pmu_event_map(int hw_event)
-{
-       struct p4_event_bind *bind;
-       unsigned int esel;
-       u64 config;
-
-       config = p4_general_events[hw_event];
-       bind = p4_config_get_bind(config);
-       esel = P4_OPCODE_ESEL(bind->opcode);
-       config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
-
-       return config;
-}
-
-/* check cpu model specifics */
-static bool p4_event_match_cpu_model(unsigned int event_idx)
-{
-       /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */
-       if (event_idx == P4_EVENT_INSTR_COMPLETED) {
-               if (boot_cpu_data.x86_model != 3 &&
-                       boot_cpu_data.x86_model != 4 &&
-                       boot_cpu_data.x86_model != 6)
-                       return false;
-       }
-
-       /*
-        * For info
-        * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2
-        */
-
-       return true;
-}
-
-static int p4_validate_raw_event(struct perf_event *event)
-{
-       unsigned int v, emask;
-
-       /* User data may have out-of-bound event index */
-       v = p4_config_unpack_event(event->attr.config);
-       if (v >= ARRAY_SIZE(p4_event_bind_map))
-               return -EINVAL;
-
-       /* It may be unsupported: */
-       if (!p4_event_match_cpu_model(v))
-               return -EINVAL;
-
-       /*
-        * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as
-        * in Architectural Performance Monitoring, it means not
-        * on _which_ logical cpu to count but rather _when_, ie it
-        * depends on logical cpu state -- count event if one cpu active,
-        * none, both or any, so we just allow user to pass any value
-        * desired.
-        *
-        * In turn we always set Tx_OS/Tx_USR bits bound to logical
-        * cpu without their propagation to another cpu
-        */
-
-       /*
-        * if an event is shared across the logical threads
-        * the user needs special permissions to be able to use it
-        */
-       if (p4_ht_active() && p4_event_bind_map[v].shared) {
-               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-                       return -EACCES;
-       }
-
-       /* ESCR EventMask bits may be invalid */
-       emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
-       if (emask & ~p4_event_bind_map[v].escr_emask)
-               return -EINVAL;
-
-       /*
-        * it may have some invalid PEBS bits
-        */
-       if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
-               return -EINVAL;
-
-       v = p4_config_unpack_metric(event->attr.config);
-       if (v >= ARRAY_SIZE(p4_pebs_bind_map))
-               return -EINVAL;
-
-       return 0;
-}
-
-static int p4_hw_config(struct perf_event *event)
-{
-       int cpu = get_cpu();
-       int rc = 0;
-       u32 escr, cccr;
-
-       /*
-        * the reason we use cpu that early is that: if we get scheduled
-        * first time on the same cpu -- we will not need swap thread
-        * specific flags in config (and will save some cpu cycles)
-        */
-
-       cccr = p4_default_cccr_conf(cpu);
-       escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
-                                        event->attr.exclude_user);
-       event->hw.config = p4_config_pack_escr(escr) |
-                          p4_config_pack_cccr(cccr);
-
-       if (p4_ht_active() && p4_ht_thread(cpu))
-               event->hw.config = p4_set_ht_bit(event->hw.config);
-
-       if (event->attr.type == PERF_TYPE_RAW) {
-               struct p4_event_bind *bind;
-               unsigned int esel;
-               /*
-                * Clear bits we reserve to be managed by kernel itself
-                * and never allowed from a user space
-                */
-                event->attr.config &= P4_CONFIG_MASK;
-
-               rc = p4_validate_raw_event(event);
-               if (rc)
-                       goto out;
-
-               /*
-                * Note that for RAW events we allow user to use P4_CCCR_RESERVED
-                * bits since we keep additional info here (for cache events and etc)
-                */
-               event->hw.config |= event->attr.config;
-               bind = p4_config_get_bind(event->attr.config);
-               if (!bind) {
-                       rc = -EINVAL;
-                       goto out;
-               }
-               esel = P4_OPCODE_ESEL(bind->opcode);
-               event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
-       }
-
-       rc = x86_setup_perfctr(event);
-out:
-       put_cpu();
-       return rc;
-}
-
-static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
-{
-       u64 v;
-
-       /* an official way for overflow indication */
-       rdmsrl(hwc->config_base, v);
-       if (v & P4_CCCR_OVF) {
-               wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
-               return 1;
-       }
-
-       /*
-        * In some circumstances the overflow might issue an NMI but did
-        * not set P4_CCCR_OVF bit. Because a counter holds a negative value
-        * we simply check for high bit being set, if it's cleared it means
-        * the counter has reached zero value and continued counting before
-        * real NMI signal was received:
-        */
-       rdmsrl(hwc->event_base, v);
-       if (!(v & ARCH_P4_UNFLAGGED_BIT))
-               return 1;
-
-       return 0;
-}
-
-static void p4_pmu_disable_pebs(void)
-{
-       /*
-        * FIXME
-        *
-        * It's still allowed that two threads setup same cache
-        * events so we can't simply clear metrics until we knew
-        * no one is depending on us, so we need kind of counter
-        * for "ReplayEvent" users.
-        *
-        * What is more complex -- RAW events, if user (for some
-        * reason) will pass some cache event metric with improper
-        * event opcode -- it's fine from hardware point of view
-        * but completely nonsense from "meaning" of such action.
-        *
-        * So at moment let leave metrics turned on forever -- it's
-        * ok for now but need to be revisited!
-        *
-        * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0);
-        * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0);
-        */
-}
-
-static inline void p4_pmu_disable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       /*
-        * If event gets disabled while counter is in overflowed
-        * state we need to clear P4_CCCR_OVF, otherwise interrupt get
-        * asserted again and again
-        */
-       (void)wrmsrl_safe(hwc->config_base,
-               p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
-}
-
-static void p4_pmu_disable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               struct perf_event *event = cpuc->events[idx];
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-               p4_pmu_disable_event(event);
-       }
-
-       p4_pmu_disable_pebs();
-}
-
-/* configuration must be valid */
-static void p4_pmu_enable_pebs(u64 config)
-{
-       struct p4_pebs_bind *bind;
-       unsigned int idx;
-
-       BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
-
-       idx = p4_config_unpack_metric(config);
-       if (idx == P4_PEBS_METRIC__none)
-               return;
-
-       bind = &p4_pebs_bind_map[idx];
-
-       (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
-       (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT,      (u64)bind->metric_vert);
-}
-
-static void p4_pmu_enable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       int thread = p4_ht_config_thread(hwc->config);
-       u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
-       unsigned int idx = p4_config_unpack_event(hwc->config);
-       struct p4_event_bind *bind;
-       u64 escr_addr, cccr;
-
-       bind = &p4_event_bind_map[idx];
-       escr_addr = bind->escr_msr[thread];
-
-       /*
-        * - we dont support cascaded counters yet
-        * - and counter 1 is broken (erratum)
-        */
-       WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
-       WARN_ON_ONCE(hwc->idx == 1);
-
-       /* we need a real Event value */
-       escr_conf &= ~P4_ESCR_EVENT_MASK;
-       escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
-
-       cccr = p4_config_unpack_cccr(hwc->config);
-
-       /*
-        * it could be Cache event so we need to write metrics
-        * into additional MSRs
-        */
-       p4_pmu_enable_pebs(hwc->config);
-
-       (void)wrmsrl_safe(escr_addr, escr_conf);
-       (void)wrmsrl_safe(hwc->config_base,
-                               (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
-}
-
-static void p4_pmu_enable_all(int added)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               struct perf_event *event = cpuc->events[idx];
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-               p4_pmu_enable_event(event);
-       }
-}
-
-static int p4_pmu_handle_irq(struct pt_regs *regs)
-{
-       struct perf_sample_data data;
-       struct cpu_hw_events *cpuc;
-       struct perf_event *event;
-       struct hw_perf_event *hwc;
-       int idx, handled = 0;
-       u64 val;
-
-       cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               int overflow;
-
-               if (!test_bit(idx, cpuc->active_mask)) {
-                       /* catch in-flight IRQs */
-                       if (__test_and_clear_bit(idx, cpuc->running))
-                               handled++;
-                       continue;
-               }
-
-               event = cpuc->events[idx];
-               hwc = &event->hw;
-
-               WARN_ON_ONCE(hwc->idx != idx);
-
-               /* it might be unflagged overflow */
-               overflow = p4_pmu_clear_cccr_ovf(hwc);
-
-               val = x86_perf_event_update(event);
-               if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
-                       continue;
-
-               handled += overflow;
-
-               /* event overflow for sure */
-               perf_sample_data_init(&data, 0, hwc->last_period);
-
-               if (!x86_perf_event_set_period(event))
-                       continue;
-
-
-               if (perf_event_overflow(event, &data, regs))
-                       x86_pmu_stop(event, 0);
-       }
-
-       if (handled)
-               inc_irq_stat(apic_perf_irqs);
-
-       /*
-        * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
-        * been observed that the OVF bit flag has to be cleared first _before_
-        * the LVTPC can be unmasked.
-        *
-        * The reason is the NMI line will continue to be asserted while the OVF
-        * bit is set.  This causes a second NMI to generate if the LVTPC is
-        * unmasked before the OVF bit is cleared, leading to unknown NMI
-        * messages.
-        */
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-
-       return handled;
-}
-
-/*
- * swap thread specific fields according to a thread
- * we are going to run on
- */
-static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
-{
-       u32 escr, cccr;
-
-       /*
-        * we either lucky and continue on same cpu or no HT support
-        */
-       if (!p4_should_swap_ts(hwc->config, cpu))
-               return;
-
-       /*
-        * the event is migrated from an another logical
-        * cpu, so we need to swap thread specific flags
-        */
-
-       escr = p4_config_unpack_escr(hwc->config);
-       cccr = p4_config_unpack_cccr(hwc->config);
-
-       if (p4_ht_thread(cpu)) {
-               cccr &= ~P4_CCCR_OVF_PMI_T0;
-               cccr |= P4_CCCR_OVF_PMI_T1;
-               if (escr & P4_ESCR_T0_OS) {
-                       escr &= ~P4_ESCR_T0_OS;
-                       escr |= P4_ESCR_T1_OS;
-               }
-               if (escr & P4_ESCR_T0_USR) {
-                       escr &= ~P4_ESCR_T0_USR;
-                       escr |= P4_ESCR_T1_USR;
-               }
-               hwc->config  = p4_config_pack_escr(escr);
-               hwc->config |= p4_config_pack_cccr(cccr);
-               hwc->config |= P4_CONFIG_HT;
-       } else {
-               cccr &= ~P4_CCCR_OVF_PMI_T1;
-               cccr |= P4_CCCR_OVF_PMI_T0;
-               if (escr & P4_ESCR_T1_OS) {
-                       escr &= ~P4_ESCR_T1_OS;
-                       escr |= P4_ESCR_T0_OS;
-               }
-               if (escr & P4_ESCR_T1_USR) {
-                       escr &= ~P4_ESCR_T1_USR;
-                       escr |= P4_ESCR_T0_USR;
-               }
-               hwc->config  = p4_config_pack_escr(escr);
-               hwc->config |= p4_config_pack_cccr(cccr);
-               hwc->config &= ~P4_CONFIG_HT;
-       }
-}
-
-/*
- * ESCR address hashing is tricky, ESCRs are not sequential
- * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and
- * the metric between any ESCRs is laid in range [0xa0,0xe1]
- *
- * so we make ~70% filled hashtable
- */
-
-#define P4_ESCR_MSR_BASE               0x000003a0
-#define P4_ESCR_MSR_MAX                        0x000003e1
-#define P4_ESCR_MSR_TABLE_SIZE         (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
-#define P4_ESCR_MSR_IDX(msr)           (msr - P4_ESCR_MSR_BASE)
-#define P4_ESCR_MSR_TABLE_ENTRY(msr)   [P4_ESCR_MSR_IDX(msr)] = msr
-
-static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
-};
-
-static int p4_get_escr_idx(unsigned int addr)
-{
-       unsigned int idx = P4_ESCR_MSR_IDX(addr);
-
-       if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE      ||
-                       !p4_escr_table[idx]             ||
-                       p4_escr_table[idx] != addr)) {
-               WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
-               return -1;
-       }
-
-       return idx;
-}
-
-static int p4_next_cntr(int thread, unsigned long *used_mask,
-                       struct p4_event_bind *bind)
-{
-       int i, j;
-
-       for (i = 0; i < P4_CNTR_LIMIT; i++) {
-               j = bind->cntr[thread][i];
-               if (j != -1 && !test_bit(j, used_mask))
-                       return j;
-       }
-
-       return -1;
-}
-
-static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
-{
-       unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-       unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
-       int cpu = smp_processor_id();
-       struct hw_perf_event *hwc;
-       struct p4_event_bind *bind;
-       unsigned int i, thread, num;
-       int cntr_idx, escr_idx;
-       u64 config_alias;
-       int pass;
-
-       bitmap_zero(used_mask, X86_PMC_IDX_MAX);
-       bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
-
-       for (i = 0, num = n; i < n; i++, num--) {
-
-               hwc = &cpuc->event_list[i]->hw;
-               thread = p4_ht_thread(cpu);
-               pass = 0;
-
-again:
-               /*
-                * It's possible to hit a circular lock
-                * between original and alternative events
-                * if both are scheduled already.
-                */
-               if (pass > 2)
-                       goto done;
-
-               bind = p4_config_get_bind(hwc->config);
-               escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
-               if (unlikely(escr_idx == -1))
-                       goto done;
-
-               if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
-                       cntr_idx = hwc->idx;
-                       if (assign)
-                               assign[i] = hwc->idx;
-                       goto reserve;
-               }
-
-               cntr_idx = p4_next_cntr(thread, used_mask, bind);
-               if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
-                       /*
-                        * Check whether an event alias is still available.
-                        */
-                       config_alias = p4_get_alias_event(hwc->config);
-                       if (!config_alias)
-                               goto done;
-                       hwc->config = config_alias;
-                       pass++;
-                       goto again;
-               }
-               /*
-                * Perf does test runs to see if a whole group can be assigned
-                * together succesfully.  There can be multiple rounds of this.
-                * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config
-                * bits, such that the next round of group assignments will
-                * cause the above p4_should_swap_ts to pass instead of fail.
-                * This leads to counters exclusive to thread0 being used by
-                * thread1.
-                *
-                * Solve this with a cheap hack, reset the idx back to -1 to
-                * force a new lookup (p4_next_cntr) to get the right counter
-                * for the right thread.
-                *
-                * This probably doesn't comply with the general spirit of how
-                * perf wants to work, but P4 is special. :-(
-                */
-               if (p4_should_swap_ts(hwc->config, cpu))
-                       hwc->idx = -1;
-               p4_pmu_swap_config_ts(hwc, cpu);
-               if (assign)
-                       assign[i] = cntr_idx;
-reserve:
-               set_bit(cntr_idx, used_mask);
-               set_bit(escr_idx, escr_mask);
-       }
-
-done:
-       return num ? -EINVAL : 0;
-}
-
-PMU_FORMAT_ATTR(cccr, "config:0-31" );
-PMU_FORMAT_ATTR(escr, "config:32-62");
-PMU_FORMAT_ATTR(ht,   "config:63"   );
-
-static struct attribute *intel_p4_formats_attr[] = {
-       &format_attr_cccr.attr,
-       &format_attr_escr.attr,
-       &format_attr_ht.attr,
-       NULL,
-};
-
-static __initconst const struct x86_pmu p4_pmu = {
-       .name                   = "Netburst P4/Xeon",
-       .handle_irq             = p4_pmu_handle_irq,
-       .disable_all            = p4_pmu_disable_all,
-       .enable_all             = p4_pmu_enable_all,
-       .enable                 = p4_pmu_enable_event,
-       .disable                = p4_pmu_disable_event,
-       .eventsel               = MSR_P4_BPU_CCCR0,
-       .perfctr                = MSR_P4_BPU_PERFCTR0,
-       .event_map              = p4_pmu_event_map,
-       .max_events             = ARRAY_SIZE(p4_general_events),
-       .get_event_constraints  = x86_get_event_constraints,
-       /*
-        * IF HT disabled we may need to use all
-        * ARCH_P4_MAX_CCCR counters simulaneously
-        * though leave it restricted at moment assuming
-        * HT is on
-        */
-       .num_counters           = ARCH_P4_MAX_CCCR,
-       .apic                   = 1,
-       .cntval_bits            = ARCH_P4_CNTRVAL_BITS,
-       .cntval_mask            = ARCH_P4_CNTRVAL_MASK,
-       .max_period             = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
-       .hw_config              = p4_hw_config,
-       .schedule_events        = p4_pmu_schedule_events,
-       /*
-        * This handles erratum N15 in intel doc 249199-029,
-        * the counter may not be updated correctly on write
-        * so we need a second write operation to do the trick
-        * (the official workaround didn't work)
-        *
-        * the former idea is taken from OProfile code
-        */
-       .perfctr_second_write   = 1,
-
-       .format_attrs           = intel_p4_formats_attr,
-};
-
-__init int p4_pmu_init(void)
-{
-       unsigned int low, high;
-       int i, reg;
-
-       /* If we get stripped -- indexing fails */
-       BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
-
-       rdmsr(MSR_IA32_MISC_ENABLE, low, high);
-       if (!(low & (1 << 7))) {
-               pr_cont("unsupported Netburst CPU model %d ",
-                       boot_cpu_data.x86_model);
-               return -ENODEV;
-       }
-
-       memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
-               sizeof(hw_cache_event_ids));
-
-       pr_cont("Netburst events, ");
-
-       x86_pmu = p4_pmu;
-
-       /*
-        * Even though the counters are configured to interrupt a particular
-        * logical processor when an overflow happens, testing has shown that
-        * on kdump kernels (which uses a single cpu), thread1's counter
-        * continues to run and will report an NMI on thread0.  Due to the
-        * overflow bug, this leads to a stream of unknown NMIs.
-        *
-        * Solve this by zero'ing out the registers to mimic a reset.
-        */
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               reg = x86_pmu_config_addr(i);
-               wrmsrl_safe(reg, 0ULL);
-       }
-
-       return 0;
-}
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c

deleted file mode 100644 (file)

index 7c1a0c0..0000000
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ /dev/null
@@ -1,279 +0,0 @@
-#include <linux/perf_event.h>
-#include <linux/types.h>
-
-#include "perf_event.h"
-
-/*
- * Not sure about some of these
- */
-static const u64 p6_perfmon_event_map[] =
-{
-  [PERF_COUNT_HW_CPU_CYCLES]           = 0x0079,       /* CPU_CLK_UNHALTED */
-  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x00c0,       /* INST_RETIRED     */
-  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0f2e,       /* L2_RQSTS:M:E:S:I */
-  [PERF_COUNT_HW_CACHE_MISSES]         = 0x012e,       /* L2_RQSTS:I       */
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x00c4,       /* BR_INST_RETIRED  */
-  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x00c5,       /* BR_MISS_PRED_RETIRED */
-  [PERF_COUNT_HW_BUS_CYCLES]           = 0x0062,       /* BUS_DRDY_CLOCKS  */
-  [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2,    /* RESOURCE_STALLS  */
-
-};
-
-static const u64 __initconst p6_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0043,  /* DATA_MEM_REFS       */
-                [ C(RESULT_MISS)   ] = 0x0045, /* DCU_LINES_IN        */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0x0f29,  /* L2_LD:M:E:S:I       */
-       },
-        [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-        },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080,  /* IFU_IFETCH         */
-               [ C(RESULT_MISS)   ] = 0x0f28,  /* L2_IFETCH:M:E:S:I  */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0x0025,  /* L2_M_LINES_INM     */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0043,  /* DATA_MEM_REFS      */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080,  /* IFU_IFETCH         */
-               [ C(RESULT_MISS)   ] = 0x0085,  /* ITLB_MISS          */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4,  /* BR_INST_RETIRED      */
-               [ C(RESULT_MISS)   ] = 0x00c5,  /* BR_MISS_PRED_RETIRED */
-        },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-static u64 p6_pmu_event_map(int hw_event)
-{
-       return p6_perfmon_event_map[hw_event];
-}
-
-/*
- * Event setting that is specified not to count anything.
- * We use this to effectively disable a counter.
- *
- * L2_RQSTS with 0 MESI unit mask.
- */
-#define P6_NOP_EVENT                   0x0000002EULL
-
-static struct event_constraint p6_event_constraints[] =
-{
-       INTEL_EVENT_CONSTRAINT(0xc1, 0x1),      /* FLOPS */
-       INTEL_EVENT_CONSTRAINT(0x10, 0x1),      /* FP_COMP_OPS_EXE */
-       INTEL_EVENT_CONSTRAINT(0x11, 0x2),      /* FP_ASSIST */
-       INTEL_EVENT_CONSTRAINT(0x12, 0x2),      /* MUL */
-       INTEL_EVENT_CONSTRAINT(0x13, 0x2),      /* DIV */
-       INTEL_EVENT_CONSTRAINT(0x14, 0x1),      /* CYCLES_DIV_BUSY */
-       EVENT_CONSTRAINT_END
-};
-
-static void p6_pmu_disable_all(void)
-{
-       u64 val;
-
-       /* p6 only has one enable register */
-       rdmsrl(MSR_P6_EVNTSEL0, val);
-       val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-       wrmsrl(MSR_P6_EVNTSEL0, val);
-}
-
-static void p6_pmu_enable_all(int added)
-{
-       unsigned long val;
-
-       /* p6 only has one enable register */
-       rdmsrl(MSR_P6_EVNTSEL0, val);
-       val |= ARCH_PERFMON_EVENTSEL_ENABLE;
-       wrmsrl(MSR_P6_EVNTSEL0, val);
-}
-
-static inline void
-p6_pmu_disable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 val = P6_NOP_EVENT;
-
-       (void)wrmsrl_safe(hwc->config_base, val);
-}
-
-static void p6_pmu_enable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 val;
-
-       val = hwc->config;
-
-       /*
-        * p6 only has a global event enable, set on PerfEvtSel0
-        * We "disable" events by programming P6_NOP_EVENT
-        * and we rely on p6_pmu_enable_all() being called
-        * to actually enable the events.
-        */
-
-       (void)wrmsrl_safe(hwc->config_base, val);
-}
-
-PMU_FORMAT_ATTR(event, "config:0-7"    );
-PMU_FORMAT_ATTR(umask, "config:8-15"   );
-PMU_FORMAT_ATTR(edge,  "config:18"     );
-PMU_FORMAT_ATTR(pc,    "config:19"     );
-PMU_FORMAT_ATTR(inv,   "config:23"     );
-PMU_FORMAT_ATTR(cmask, "config:24-31"  );
-
-static struct attribute *intel_p6_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_pc.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask.attr,
-       NULL,
-};
-
-static __initconst const struct x86_pmu p6_pmu = {
-       .name                   = "p6",
-       .handle_irq             = x86_pmu_handle_irq,
-       .disable_all            = p6_pmu_disable_all,
-       .enable_all             = p6_pmu_enable_all,
-       .enable                 = p6_pmu_enable_event,
-       .disable                = p6_pmu_disable_event,
-       .hw_config              = x86_pmu_hw_config,
-       .schedule_events        = x86_schedule_events,
-       .eventsel               = MSR_P6_EVNTSEL0,
-       .perfctr                = MSR_P6_PERFCTR0,
-       .event_map              = p6_pmu_event_map,
-       .max_events             = ARRAY_SIZE(p6_perfmon_event_map),
-       .apic                   = 1,
-       .max_period             = (1ULL << 31) - 1,
-       .version                = 0,
-       .num_counters           = 2,
-       /*
-        * Events have 40 bits implemented. However they are designed such
-        * that bits [32-39] are sign extensions of bit 31. As such the
-        * effective width of a event for P6-like PMU is 32 bits only.
-        *
-        * See IA-32 Intel Architecture Software developer manual Vol 3B
-        */
-       .cntval_bits            = 32,
-       .cntval_mask            = (1ULL << 32) - 1,
-       .get_event_constraints  = x86_get_event_constraints,
-       .event_constraints      = p6_event_constraints,
-
-       .format_attrs           = intel_p6_formats_attr,
-       .events_sysfs_show      = intel_event_sysfs_show,
-
-};
-
-static __init void p6_pmu_rdpmc_quirk(void)
-{
-       if (boot_cpu_data.x86_mask < 9) {
-               /*
-                * PPro erratum 26; fixed in stepping 9 and above.
-                */
-               pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n");
-               x86_pmu.attr_rdpmc_broken = 1;
-               x86_pmu.attr_rdpmc = 0;
-       }
-}
-
-__init int p6_pmu_init(void)
-{
-       x86_pmu = p6_pmu;
-
-       switch (boot_cpu_data.x86_model) {
-       case  1: /* Pentium Pro */
-               x86_add_quirk(p6_pmu_rdpmc_quirk);
-               break;
-
-       case  3: /* Pentium II - Klamath */
-       case  5: /* Pentium II - Deschutes */
-       case  6: /* Pentium II - Mendocino */
-               break;
-
-       case  7: /* Pentium III - Katmai */
-       case  8: /* Pentium III - Coppermine */
-       case 10: /* Pentium III Xeon */
-       case 11: /* Pentium III - Tualatin */
-               break;
-
-       case  9: /* Pentium M - Banias */
-       case 13: /* Pentium M - Dothan */
-               break;
-
-       default:
-               pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model);
-               return -ENODEV;
-       }
-
-       memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
-               sizeof(hw_cache_event_ids));
-
-       return 0;
-}
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c

index 819d94982e078b8597f084f8409fcd3106361293..f6f50c4ceaeceef16170d14d6d55376823bbd63f 100644 (file)
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -51,7 +51,7 @@ void x86_init_rdrand(struct cpuinfo_x86 *c)
         for (i = 0; i < SANITY_CHECK_LOOPS; i++) {
                 if (!rdrand_long(&tmp)) {
                         clear_cpu_cap(c, X86_FEATURE_RDRAND);
-                       printk_once(KERN_WARNING "rdrand: disabled\n");
+                       pr_warn_once("rdrand: disabled\n");
                         return;
                 }
         }
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c

index 4c60eaf0571c2fb1a6d73258861398fc6bcc963e..cd531355e8386b4177d4dcecaf1031ba8c2086c8 100644 (file)
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -87,10 +87,10 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
         c->x86_max_cores = (core_level_siblings / smp_num_siblings);
  
         if (!printed) {
-               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
+               pr_info("CPU: Physical Processor ID: %d\n",
                        c->phys_proc_id);
                 if (c->x86_max_cores > 1)
-                       printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
+                       pr_info("CPU: Processor Core ID: %d\n",
                                c->cpu_core_id);
                 printed = 1;
         }
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c

index 252da7aceca67ff580e8189ed267b2ce44d83373..34178564be2a70adbeaf3fd890cac32d97a51635 100644 (file)
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,6 +1,6 @@
  #include <linux/kernel.h>
  #include <linux/mm.h>
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
  #include <asm/msr.h>
  #include "cpu.h"
  
@@ -33,7 +33,7 @@ static void init_transmeta(struct cpuinfo_x86 *c)
         if (max >= 0x80860001) {
                 cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags);
                 if (cpu_rev != 0x02000000) {
-                       printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
+                       pr_info("CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
                                 (cpu_rev >> 24) & 0xff,
                                 (cpu_rev >> 16) & 0xff,
                                 (cpu_rev >> 8) & 0xff,
@@ -44,10 +44,10 @@ static void init_transmeta(struct cpuinfo_x86 *c)
         if (max >= 0x80860002) {
                 cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy);
                 if (cpu_rev == 0x02000000) {
-                       printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n",
+                       pr_info("CPU: Processor revision %08X, %u MHz\n",
                                 new_cpu_rev, cpu_freq);
                 }
-               printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
+               pr_info("CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
                        (cms_rev1 >> 24) & 0xff,
                        (cms_rev1 >> 16) & 0xff,
                        (cms_rev1 >> 8) & 0xff,
@@ -76,7 +76,7 @@ static void init_transmeta(struct cpuinfo_x86 *c)
                       (void *)&cpu_info[56],
                       (void *)&cpu_info[60]);
                 cpu_info[64] = '\0';
-               printk(KERN_INFO "CPU: %s\n", cpu_info);
+               pr_info("CPU: %s\n", cpu_info);
         }
  
         /* Unhide possibly hidden capability flags */
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c

index 628a059a9a0663ef7cbdf07e4750c9e1e4f53006..364e5834689753fc7da34c6dc8a34cca22ab92db 100644 (file)
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -62,7 +62,7 @@ static unsigned long vmware_get_tsc_khz(void)
         tsc_hz = eax | (((uint64_t)ebx) << 32);
         do_div(tsc_hz, 1000);
         BUG_ON(tsc_hz >> 32);
-       printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
+       pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
                          (unsigned long) tsc_hz / 1000,
                          (unsigned long) tsc_hz % 1000);
  
@@ -84,8 +84,7 @@ static void __init vmware_platform_setup(void)
         if (ebx != UINT_MAX)
                 x86_platform.calibrate_tsc = vmware_get_tsc_khz;
         else
-               printk(KERN_WARNING
-                      "Failed to get TSC freq from the hypervisor\n");
+               pr_warn("Failed to get TSC freq from the hypervisor\n");
  }
  
  /*
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c

index 58f34319b29ab64aee4bd21e6ff30367d478c262..9ef978d69c22e00cffdce3592cbe5a495027a058 100644 (file)
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -57,10 +57,9 @@ struct crash_elf_data {
         struct kimage *image;
         /*
          * Total number of ram ranges we have after various adjustments for
-        * GART, crash reserved region etc.
+        * crash reserved region, etc.
          */
         unsigned int max_nr_ranges;
-       unsigned long gart_start, gart_end;
  
         /* Pointer to elf header */
         void *ehdr;
@@ -201,17 +200,6 @@ static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg)
         return 0;
  }
  
-static int get_gart_ranges_callback(u64 start, u64 end, void *arg)
-{
-       struct crash_elf_data *ced = arg;
-
-       ced->gart_start = start;
-       ced->gart_end = end;
-
-       /* Not expecting more than 1 gart aperture */
-       return 1;
-}
-
  
  /* Gather all the required information to prepare elf headers for ram regions */
  static void fill_up_crash_elf_data(struct crash_elf_data *ced,
@@ -226,22 +214,6 @@ static void fill_up_crash_elf_data(struct crash_elf_data *ced,
  
         ced->max_nr_ranges = nr_ranges;
  
-       /*
-        * We don't create ELF headers for GART aperture as an attempt
-        * to dump this memory in second kernel leads to hang/crash.
-        * If gart aperture is present, one needs to exclude that region
-        * and that could lead to need of extra phdr.
-        */
-       walk_iomem_res("GART", IORESOURCE_MEM, 0, -1,
-                               ced, get_gart_ranges_callback);
-
-       /*
-        * If we have gart region, excluding that could potentially split
-        * a memory range, resulting in extra header. Account for  that.
-        */
-       if (ced->gart_end)
-               ced->max_nr_ranges++;
-
         /* Exclusion of crash region could split memory ranges */
         ced->max_nr_ranges++;
  
@@ -350,13 +322,6 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced,
                         return ret;
         }
  
-       /* Exclude GART region */
-       if (ced->gart_end) {
-               ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end);
-               if (ret)
-                       return ret;
-       }
-
         return ret;
  }
  
@@ -599,12 +564,12 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
         /* Add ACPI tables */
         cmd.type = E820_ACPI;
         flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-       walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd,
+       walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd,
                        memmap_entry_callback);
  
         /* Add ACPI Non-volatile Storage */
         cmd.type = E820_NVS;
-       walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd,
+       walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
                         memmap_entry_callback);
  
         /* Add crashk_low_res region */
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c

index 569c1e4f96feb897956a64e35bd8fccb07dca6c6..621b501f89351146b84b090b7160166bb9d5907e 100644 (file)
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -24,6 +24,7 @@
  #include <asm/e820.h>
  #include <asm/proto.h>
  #include <asm/setup.h>
+#include <asm/cpufeature.h>
  
  /*
   * The e820 map is the map that gets modified e.g. with command line parameters
@@ -925,6 +926,41 @@ static const char *e820_type_to_string(int e820_type)
         }
  }
  
+static unsigned long e820_type_to_iomem_type(int e820_type)
+{
+       switch (e820_type) {
+       case E820_RESERVED_KERN:
+       case E820_RAM:
+               return IORESOURCE_SYSTEM_RAM;
+       case E820_ACPI:
+       case E820_NVS:
+       case E820_UNUSABLE:
+       case E820_PRAM:
+       case E820_PMEM:
+       default:
+               return IORESOURCE_MEM;
+       }
+}
+
+static unsigned long e820_type_to_iores_desc(int e820_type)
+{
+       switch (e820_type) {
+       case E820_ACPI:
+               return IORES_DESC_ACPI_TABLES;
+       case E820_NVS:
+               return IORES_DESC_ACPI_NV_STORAGE;
+       case E820_PMEM:
+               return IORES_DESC_PERSISTENT_MEMORY;
+       case E820_PRAM:
+               return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
+       case E820_RESERVED_KERN:
+       case E820_RAM:
+       case E820_UNUSABLE:
+       default:
+               return IORES_DESC_NONE;
+       }
+}
+
  static bool do_mark_busy(u32 type, struct resource *res)
  {
         /* this is the legacy bios/dos rom-shadow + mmio region */
@@ -967,7 +1003,8 @@ void __init e820_reserve_resources(void)
                 res->start = e820.map[i].addr;
                 res->end = end;
  
-               res->flags = IORESOURCE_MEM;
+               res->flags = e820_type_to_iomem_type(e820.map[i].type);
+               res->desc = e820_type_to_iores_desc(e820.map[i].type);
  
                 /*
                  * don't register the region that could be conflicted with
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c

index d25097c3fc1d1af8af35c156f05121f9f4d46a94..d5804adfa6da6c3d495b124faf47c288e2c0c6cf 100644 (file)
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -409,8 +409,10 @@ static inline void copy_init_fpstate_to_fpregs(void)
  {
         if (use_xsave())
                 copy_kernel_to_xregs(&init_fpstate.xsave, -1);
-       else
+       else if (static_cpu_has(X86_FEATURE_FXSR))
                 copy_kernel_to_fxregs(&init_fpstate.fxsave);
+       else
+               copy_kernel_to_fregs(&init_fpstate.fsave);
  }
  
  /*
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c

index 6d9f0a7ef4c8e5da3d596444242de7478d7b9356..bd08fb77073d833ec8f1e27bc67d20fa37b559dc 100644 (file)
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -78,13 +78,15 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
         cr0 &= ~(X86_CR0_TS | X86_CR0_EM);
         write_cr0(cr0);
  
-       asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
-                    : "+m" (fsw), "+m" (fcw));
+       if (!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) {
+               asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
+                            : "+m" (fsw), "+m" (fcw));
  
-       if (fsw == 0 && (fcw & 0x103f) == 0x003f)
-               set_cpu_cap(c, X86_FEATURE_FPU);
-       else
-               clear_cpu_cap(c, X86_FEATURE_FPU);
+               if (fsw == 0 && (fcw & 0x103f) == 0x003f)
+                       set_cpu_cap(c, X86_FEATURE_FPU);
+               else
+                       clear_cpu_cap(c, X86_FEATURE_FPU);
+       }
  
  #ifndef CONFIG_MATH_EMULATION
         if (!cpu_has_fpu) {
@@ -132,7 +134,7 @@ static void __init fpu__init_system_generic(void)
          * Set up the legacy init FPU context. (xstate init might overwrite this
          * with a more modern format, if the CPU supports it.)
          */
-       fpstate_init_fxstate(&init_fpstate.fxsave);
+       fpstate_init(&init_fpstate);
  
         fpu__init_system_mxcsr();
  }
@@ -300,12 +302,6 @@ u64 __init fpu__get_supported_xfeatures_mask(void)
  static void __init fpu__clear_eager_fpu_features(void)
  {
         setup_clear_cpu_cap(X86_FEATURE_MPX);
-       setup_clear_cpu_cap(X86_FEATURE_AVX);
-       setup_clear_cpu_cap(X86_FEATURE_AVX2);
-       setup_clear_cpu_cap(X86_FEATURE_AVX512F);
-       setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
-       setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
-       setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
  }
  
  /*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c

index d425cda5ae6d0f1544c5d8decdb14d8682d7f7dd..6e8354f5a59353bd18801860980d779a19c61f74 100644 (file)
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -51,6 +51,9 @@ void fpu__xstate_clear_all_cpu_caps(void)
         setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
         setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
         setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
+       setup_clear_cpu_cap(X86_FEATURE_AVX512DQ);
+       setup_clear_cpu_cap(X86_FEATURE_AVX512BW);
+       setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
         setup_clear_cpu_cap(X86_FEATURE_MPX);
         setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
  }
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c

index 29408d6d66267c12ce4eab826e38bac3d8367d17..702547ce33c9c7a33fd321fdb94367af5515dd8b 100644 (file)
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -81,9 +81,9 @@ within(unsigned long addr, unsigned long start, unsigned long end)
  static unsigned long text_ip_addr(unsigned long ip)
  {
         /*
-        * On x86_64, kernel text mappings are mapped read-only with
-        * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
-        * of the kernel text mapping to modify the kernel text.
+        * On x86_64, kernel text mappings are mapped read-only, so we use
+        * the kernel identity mapping instead of the kernel text mapping
+        * to modify the kernel text.
          *
          * For 32bit kernels, these mappings are same and we can use
          * kernel identity mapping to modify code.
@@ -697,9 +697,8 @@ static inline void tramp_free(void *tramp) { }
  #endif
  
  /* Defined as markers to the end of the ftrace default trampolines */
-extern void ftrace_caller_end(void);
  extern void ftrace_regs_caller_end(void);
-extern void ftrace_return(void);
+extern void ftrace_epilogue(void);
  extern void ftrace_caller_op_ptr(void);
  extern void ftrace_regs_caller_op_ptr(void);
  
@@ -746,7 +745,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
                 op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
         } else {
                 start_offset = (unsigned long)ftrace_caller;
-               end_offset = (unsigned long)ftrace_caller_end;
+               end_offset = (unsigned long)ftrace_epilogue;
                 op_offset = (unsigned long)ftrace_caller_op_ptr;
         }
  
@@ -754,7 +753,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
  
         /*
          * Allocate enough size to store the ftrace_caller code,
-        * the jmp to ftrace_return, as well as the address of
+        * the jmp to ftrace_epilogue, as well as the address of
          * the ftrace_ops this trampoline is used for.
          */
         trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *));
@@ -772,8 +771,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
  
         ip = (unsigned long)trampoline + size;
  
-       /* The trampoline ends with a jmp to ftrace_return */
-       jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_return);
+       /* The trampoline ends with a jmp to ftrace_epilogue */
+       jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_epilogue);
         memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE);
  
         /*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c

index 2c0f3407bd1f1ae8adcf43cacdb479c91c096669..1f4422d5c8d013992d6b97a6a08734bde5bdb2b1 100644 (file)
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -40,13 +40,8 @@ pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
  /* Wipe all early page tables except for the kernel symbol map */
  static void __init reset_early_page_tables(void)
  {
-       unsigned long i;
-
-       for (i = 0; i < PTRS_PER_PGD-1; i++)
-               early_level4_pgt[i].pgd = 0;
-
+       memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
         next_early_pgt = 0;
-
         write_cr3(__pa_nodebug(early_level4_pgt));
  }
  
@@ -54,7 +49,6 @@ static void __init reset_early_page_tables(void)
  int __init early_make_pgtable(unsigned long address)
  {
         unsigned long physaddr = address - __PAGE_OFFSET;
-       unsigned long i;
         pgdval_t pgd, *pgd_p;
         pudval_t pud, *pud_p;
         pmdval_t pmd, *pmd_p;
@@ -81,8 +75,7 @@ again:
                 }
  
                 pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
-               for (i = 0; i < PTRS_PER_PUD; i++)
-                       pud_p[i] = 0;
+               memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
                 *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
         }
         pud_p += pud_index(address);
@@ -97,8 +90,7 @@ again:
                 }
  
                 pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
-               for (i = 0; i < PTRS_PER_PMD; i++)
-                       pmd_p[i] = 0;
+               memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
                 *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
         }
         pmd = (physaddr & PMD_MASK) + early_pmd_flags;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S

index 6bc9ae24b6d2a74930701c0c6ea60fe090a3ebbc..af1112980dd411334ef59d7d0a7b818946f2137d 100644 (file)
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -19,7 +19,7 @@
  #include <asm/setup.h>
  #include <asm/processor-flags.h>
  #include <asm/msr-index.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/percpu.h>
  #include <asm/nops.h>
  #include <asm/bootparam.h>
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S

index ffdc0e8603902b12b0cd44fd175d806b046ccb4b..22fbf9df61bb4eecbb5ffe530562b56c1def90b8 100644 (file)
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -38,7 +38,6 @@
  #define pud_index(x)   (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
  
  L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET)
-L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET)
  L4_START_KERNEL = pgd_index(__START_KERNEL_map)
  L3_START_KERNEL = pud_index(__START_KERNEL_map)
  
@@ -76,9 +75,7 @@ startup_64:
         subq    $_text - __START_KERNEL_map, %rbp
  
         /* Is the address not 2M aligned? */
-       movq    %rbp, %rax
-       andl    $~PMD_PAGE_MASK, %eax
-       testl   %eax, %eax
+       testl   $~PMD_PAGE_MASK, %ebp
         jnz     bad_address
  
         /*
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c

index b8e6ff5cd5d055892715af36f1e9b72340f7b18c..be0ebbb6d1d144fc86bcab6716871ea8cdaa4744 100644 (file)
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -12,6 +12,7 @@
  #include <linux/pm.h>
  #include <linux/io.h>
  
+#include <asm/cpufeature.h>
  #include <asm/irqdomain.h>
  #include <asm/fixmap.h>
  #include <asm/hpet.h>
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c

index 44256a62702b2c51077fc0b8b82a904ed122b9f6..ed15cd486d06347626c080a0081e3eed01ac9128 100644 (file)
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -750,9 +750,7 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
  int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
  {
         int err;
-#ifdef CONFIG_DEBUG_RODATA
         char opc[BREAK_INSTR_SIZE];
-#endif /* CONFIG_DEBUG_RODATA */
  
         bpt->type = BP_BREAKPOINT;
         err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
@@ -761,7 +759,6 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
                 return err;
         err = probe_kernel_write((char *)bpt->bpt_addr,
                                  arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
-#ifdef CONFIG_DEBUG_RODATA
         if (!err)
                 return err;
         /*
@@ -778,13 +775,12 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
         if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
                 return -EINVAL;
         bpt->type = BP_POKE_BREAKPOINT;
-#endif /* CONFIG_DEBUG_RODATA */
+
         return err;
  }
  
  int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
  {
-#ifdef CONFIG_DEBUG_RODATA
         int err;
         char opc[BREAK_INSTR_SIZE];
  
@@ -801,8 +797,8 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
         if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
                 goto knl_write;
         return err;
+
  knl_write:
-#endif /* CONFIG_DEBUG_RODATA */
         return probe_kernel_write((char *)bpt->bpt_addr,
                                   (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
  }
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c

index 1deffe6cc87367631fe23b4c5d4b6be697ff4b0e..0f05deeff5ce2e2af5aa7fb1f2fe1f59f4ed8348 100644 (file)
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -988,7 +988,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
                  * In case the user-specified fault handler returned
                  * zero, try to fix up.
                  */
-               if (fixup_exception(regs))
+               if (fixup_exception(regs, trapnr))
                         return 1;
  
                 /*
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S

index 87e1762e2bca74a3f22004d09d9069d72b41b62b..ed48a9f465f84685d1ecdfb78075f869001e369b 100644 (file)
--- a/arch/x86/kernel/mcount_64.S
+++ b/arch/x86/kernel/mcount_64.S
@@ -168,12 +168,14 @@ GLOBAL(ftrace_call)
         restore_mcount_regs
  
         /*
-        * The copied trampoline must call ftrace_return as it
+        * The copied trampoline must call ftrace_epilogue as it
          * still may need to call the function graph tracer.
+        *
+        * The code up to this label is copied into trampolines so
+        * think twice before adding any new code or changing the
+        * layout here.
          */
-GLOBAL(ftrace_caller_end)
-
-GLOBAL(ftrace_return)
+GLOBAL(ftrace_epilogue)
  
  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
  GLOBAL(ftrace_graph_call)
@@ -244,14 +246,14 @@ GLOBAL(ftrace_regs_call)
         popfq
  
         /*
-        * As this jmp to ftrace_return can be a short jump
+        * As this jmp to ftrace_epilogue can be a short jump
          * it must not be copied into the trampoline.
          * The trampoline will add the code to jump
          * to the return.
          */
  GLOBAL(ftrace_regs_caller_end)
  
-       jmp ftrace_return
+       jmp ftrace_epilogue
  
  END(ftrace_regs_caller)
  
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c

index 30ca7607cbbbbcae4793aa5c14d8f73bbd784d71..97340f2c437c64def7a83f75a8f9b0bc6a968451 100644 (file)
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -408,7 +408,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
         processor.cpuflag = CPU_ENABLED;
         processor.cpufeature = (boot_cpu_data.x86 << 8) |
             (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
-       processor.featureflag = boot_cpu_data.x86_capability[0];
+       processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX];
         processor.reserved[0] = 0;
         processor.reserved[1] = 0;
         for (i = 0; i < 2; i++) {
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c

index 64f9616f93f1ec39a85494857d2d07728a7ec5d2..7f3550acde1b8ce604f4b5cc8b53c232450c524f 100644 (file)
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -40,7 +40,7 @@
  #include <linux/uaccess.h>
  #include <linux/gfp.h>
  
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
  #include <asm/msr.h>
  
  static struct class *msr_class;
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c

index 8a2cdd736fa4da82374fa9392e76b5716cc0f89a..04b132a767f116e8ff35efcbbc036e331a145a4b 100644 (file)
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -30,6 +30,7 @@
  #include <asm/nmi.h>
  #include <asm/x86_init.h>
  #include <asm/reboot.h>
+#include <asm/cache.h>
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/nmi.h>
@@ -69,7 +70,7 @@ struct nmi_stats {
  
  static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
  
-static int ignore_nmis;
+static int ignore_nmis __read_mostly;
  
  int unknown_nmi_panic;
  /*
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c

index 14415aff18136524cfa3d4c9b07bdde6ee0d6f54..92f70147a9a673e74b3753637adbb08e2a4869fa 100644 (file)
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -13,11 +13,11 @@ static int found(u64 start, u64 end, void *data)
  
  static __init int register_e820_pmem(void)
  {
-       char *pmem = "Persistent Memory (legacy)";
         struct platform_device *pdev;
         int rc;
  
-       rc = walk_iomem_res(pmem, IORESOURCE_MEM, 0, -1, NULL, found);
+       rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
+                                IORESOURCE_MEM, 0, -1, NULL, found);
         if (rc <= 0)
                 return 0;
  
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c

index 9f7c21c22477e59462d72e930d79a4c2a238a051..2915d54e9dd5f730558fba3ed2e7f3c9d4cd5a5b 100644 (file)
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -57,6 +57,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
           */
         .io_bitmap              = { [0 ... IO_BITMAP_LONGS] = ~0 },
  #endif
+#ifdef CONFIG_X86_32
+       .SYSENTER_stack_canary  = STACK_END_MAGIC,
+#endif
  };
  EXPORT_PER_CPU_SYMBOL(cpu_tss);
  
@@ -418,9 +421,9 @@ static void mwait_idle(void)
         if (!current_set_polling_and_test()) {
                 trace_cpu_idle_rcuidle(1, smp_processor_id());
                 if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
-                       smp_mb(); /* quirk */
+                       mb(); /* quirk */
                         clflush((void *)&current_thread_info()->flags);
-                       smp_mb(); /* quirk */
+                       mb(); /* quirk */
                 }
  
                 __monitor((void *)&current_thread_info()->flags, 0, 0);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c

index d3d80e6d42a20ea665325b4cf2a5e7be3c43289a..aa52c10094755e2fa9fdd5ac7a39b0c36effac8a 100644 (file)
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -152,21 +152,21 @@ static struct resource data_resource = {
         .name   = "Kernel data",
         .start  = 0,
         .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
  };
  
  static struct resource code_resource = {
         .name   = "Kernel code",
         .start  = 0,
         .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
  };
  
  static struct resource bss_resource = {
         .name   = "Kernel bss",
         .start  = 0,
         .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
  };
  
  
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c

index cb6282c3638ffbd32bcb33663d8cefc17eac8a8e..548ddf7d6fd2085676328ffcde6375ce3e88b477 100644 (file)
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -61,7 +61,38 @@
         regs->seg = GET_SEG(seg) | 3;                   \
  } while (0)
  
-int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
+#ifdef CONFIG_X86_64
+/*
+ * If regs->ss will cause an IRET fault, change it.  Otherwise leave it
+ * alone.  Using this generally makes no sense unless
+ * user_64bit_mode(regs) would return true.
+ */
+static void force_valid_ss(struct pt_regs *regs)
+{
+       u32 ar;
+       asm volatile ("lar %[old_ss], %[ar]\n\t"
+                     "jz 1f\n\t"               /* If invalid: */
+                     "xorl %[ar], %[ar]\n\t"   /* set ar = 0 */
+                     "1:"
+                     : [ar] "=r" (ar)
+                     : [old_ss] "rm" ((u16)regs->ss));
+
+       /*
+        * For a valid 64-bit user context, we need DPL 3, type
+        * read-write data or read-write exp-down data, and S and P
+        * set.  We can't use VERW because VERW doesn't check the
+        * P bit.
+        */
+       ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK;
+       if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) &&
+           ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN))
+               regs->ss = __USER_DS;
+}
+#endif
+
+static int restore_sigcontext(struct pt_regs *regs,
+                             struct sigcontext __user *sc,
+                             unsigned long uc_flags)
  {
         unsigned long buf_val;
         void __user *buf;
@@ -94,15 +125,18 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
                 COPY(r15);
  #endif /* CONFIG_X86_64 */
  
-#ifdef CONFIG_X86_32
                 COPY_SEG_CPL3(cs);
                 COPY_SEG_CPL3(ss);
-#else /* !CONFIG_X86_32 */
-               /* Kernel saves and restores only the CS segment register on signals,
-                * which is the bare minimum needed to allow mixed 32/64-bit code.
-                * App's signal handler can save/restore other segments if needed. */
-               COPY_SEG_CPL3(cs);
-#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_X86_64
+               /*
+                * Fix up SS if needed for the benefit of old DOSEMU and
+                * CRIU.
+                */
+               if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) &&
+                            user_64bit_mode(regs)))
+                       force_valid_ss(regs);
+#endif
  
                 get_user_ex(tmpflags, &sc->flags);
                 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
@@ -165,6 +199,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
                 put_user_ex(regs->cs, &sc->cs);
                 put_user_ex(0, &sc->gs);
                 put_user_ex(0, &sc->fs);
+               put_user_ex(regs->ss, &sc->ss);
  #endif /* CONFIG_X86_32 */
  
                 put_user_ex(fpstate, &sc->fpstate);
@@ -403,6 +438,21 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
         return 0;
  }
  #else /* !CONFIG_X86_32 */
+static unsigned long frame_uc_flags(struct pt_regs *regs)
+{
+       unsigned long flags;
+
+       if (cpu_has_xsave)
+               flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS;
+       else
+               flags = UC_SIGCONTEXT_SS;
+
+       if (likely(user_64bit_mode(regs)))
+               flags |= UC_STRICT_RESTORE_SS;
+
+       return flags;
+}
+
  static int __setup_rt_frame(int sig, struct ksignal *ksig,
                             sigset_t *set, struct pt_regs *regs)
  {
@@ -422,10 +472,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
  
         put_user_try {
                 /* Create the ucontext.  */
-               if (cpu_has_xsave)
-                       put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
-               else
-                       put_user_ex(0, &frame->uc.uc_flags);
+               put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags);
                 put_user_ex(0, &frame->uc.uc_link);
                 save_altstack_ex(&frame->uc.uc_stack, regs->sp);
  
@@ -459,10 +506,28 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
  
         regs->sp = (unsigned long)frame;
  
-       /* Set up the CS register to run signal handlers in 64-bit mode,
-          even if the handler happens to be interrupting 32-bit code. */
+       /*
+        * Set up the CS and SS registers to run signal handlers in
+        * 64-bit mode, even if the handler happens to be interrupting
+        * 32-bit or 16-bit code.
+        *
+        * SS is subtle.  In 64-bit mode, we don't need any particular
+        * SS descriptor, but we do need SS to be valid.  It's possible
+        * that the old SS is entirely bogus -- this can happen if the
+        * signal we're trying to deliver is #GP or #SS caused by a bad
+        * SS value.  We also have a compatbility issue here: DOSEMU
+        * relies on the contents of the SS register indicating the
+        * SS value at the time of the signal, even though that code in
+        * DOSEMU predates sigreturn's ability to restore SS.  (DOSEMU
+        * avoids relying on sigreturn to restore SS; instead it uses
+        * a trampoline.)  So we do our best: if the old SS was valid,
+        * we keep it.  Otherwise we replace it.
+        */
         regs->cs = __USER_CS;
  
+       if (unlikely(regs->ss != __USER_DS))
+               force_valid_ss(regs);
+
         return 0;
  }
  #endif /* CONFIG_X86_32 */
@@ -489,10 +554,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
  
         put_user_try {
                 /* Create the ucontext.  */
-               if (cpu_has_xsave)
-                       put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
-               else
-                       put_user_ex(0, &frame->uc.uc_flags);
+               put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags);
                 put_user_ex(0, &frame->uc.uc_link);
                 compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
                 put_user_ex(0, &frame->uc.uc__pad0);
@@ -554,7 +616,11 @@ asmlinkage unsigned long sys_sigreturn(void)
  
         set_current_blocked(&set);
  
-       if (restore_sigcontext(regs, &frame->sc))
+       /*
+        * x86_32 has no uc_flags bits relevant to restore_sigcontext.
+        * Save a few cycles by skipping the __get_user.
+        */
+       if (restore_sigcontext(regs, &frame->sc, 0))
                 goto badframe;
         return regs->ax;
  
@@ -570,16 +636,19 @@ asmlinkage long sys_rt_sigreturn(void)
         struct pt_regs *regs = current_pt_regs();
         struct rt_sigframe __user *frame;
         sigset_t set;
+       unsigned long uc_flags;
  
         frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
         if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
                 goto badframe;
         if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
                 goto badframe;
+       if (__get_user(uc_flags, &frame->uc.uc_flags))
+               goto badframe;
  
         set_current_blocked(&set);
  
-       if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
+       if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
                 goto badframe;
  
         if (restore_altstack(&frame->uc.uc_stack))
@@ -692,12 +761,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
  
  static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
  {
-#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64)
+#ifdef CONFIG_X86_64
+       if (is_ia32_task())
+               return __NR_ia32_restart_syscall;
+#endif
+#ifdef CONFIG_X86_X32_ABI
+       return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT);
+#else
         return __NR_restart_syscall;
-#else /* !CONFIG_X86_32 && CONFIG_X86_64 */
-       return test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall :
-               __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT);
-#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */
+#endif
  }
  
  /*
@@ -763,6 +835,7 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
         struct pt_regs *regs = current_pt_regs();
         struct rt_sigframe_x32 __user *frame;
         sigset_t set;
+       unsigned long uc_flags;
  
         frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
  
@@ -770,10 +843,12 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
                 goto badframe;
         if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
                 goto badframe;
+       if (__get_user(uc_flags, &frame->uc.uc_flags))
+               goto badframe;
  
         set_current_blocked(&set);
  
-       if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
+       if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
                 goto badframe;
  
         if (compat_restore_altstack(&frame->uc.uc_stack))
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c

index 24d57f77b3c19615840ac4f09c8c0fd299864698..3bf1e0b5f827ae43dca2b2c0c3c4040e56d15c56 100644 (file)
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -97,6 +97,14 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
  DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
  EXPORT_PER_CPU_SYMBOL(cpu_info);
  
+/* Logical package management. We might want to allocate that dynamically */
+static int *physical_to_logical_pkg __read_mostly;
+static unsigned long *physical_package_map __read_mostly;;
+static unsigned long *logical_package_map  __read_mostly;
+static unsigned int max_physical_pkg_id __read_mostly;
+unsigned int __max_logical_packages __read_mostly;
+EXPORT_SYMBOL(__max_logical_packages);
+
  static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
  {
         unsigned long flags;
@@ -251,6 +259,97 @@ static void notrace start_secondary(void *unused)
         cpu_startup_entry(CPUHP_ONLINE);
  }
  
+int topology_update_package_map(unsigned int apicid, unsigned int cpu)
+{
+       unsigned int new, pkg = apicid >> boot_cpu_data.x86_coreid_bits;
+
+       /* Called from early boot ? */
+       if (!physical_package_map)
+               return 0;
+
+       if (pkg >= max_physical_pkg_id)
+               return -EINVAL;
+
+       /* Set the logical package id */
+       if (test_and_set_bit(pkg, physical_package_map))
+               goto found;
+
+       if (pkg < __max_logical_packages) {
+               set_bit(pkg, logical_package_map);
+               physical_to_logical_pkg[pkg] = pkg;
+               goto found;
+       }
+       new = find_first_zero_bit(logical_package_map, __max_logical_packages);
+       if (new >= __max_logical_packages) {
+               physical_to_logical_pkg[pkg] = -1;
+               pr_warn("APIC(%x) Package %u exceeds logical package map\n",
+                       apicid, pkg);
+               return -ENOSPC;
+       }
+       set_bit(new, logical_package_map);
+       pr_info("APIC(%x) Converting physical %u to logical package %u\n",
+               apicid, pkg, new);
+       physical_to_logical_pkg[pkg] = new;
+
+found:
+       cpu_data(cpu).logical_proc_id = physical_to_logical_pkg[pkg];
+       return 0;
+}
+
+/**
+ * topology_phys_to_logical_pkg - Map a physical package id to a logical
+ *
+ * Returns logical package id or -1 if not found
+ */
+int topology_phys_to_logical_pkg(unsigned int phys_pkg)
+{
+       if (phys_pkg >= max_physical_pkg_id)
+               return -1;
+       return physical_to_logical_pkg[phys_pkg];
+}
+EXPORT_SYMBOL(topology_phys_to_logical_pkg);
+
+static void __init smp_init_package_map(void)
+{
+       unsigned int ncpus, cpu;
+       size_t size;
+
+       /*
+        * Today neither Intel nor AMD support heterogenous systems. That
+        * might change in the future....
+        */
+       ncpus = boot_cpu_data.x86_max_cores * smp_num_siblings;
+       __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus);
+
+       /*
+        * Possibly larger than what we need as the number of apic ids per
+        * package can be smaller than the actual used apic ids.
+        */
+       max_physical_pkg_id = DIV_ROUND_UP(MAX_LOCAL_APIC, ncpus);
+       size = max_physical_pkg_id * sizeof(unsigned int);
+       physical_to_logical_pkg = kmalloc(size, GFP_KERNEL);
+       memset(physical_to_logical_pkg, 0xff, size);
+       size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long);
+       physical_package_map = kzalloc(size, GFP_KERNEL);
+       size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long);
+       logical_package_map = kzalloc(size, GFP_KERNEL);
+
+       pr_info("Max logical packages: %u\n", __max_logical_packages);
+
+       for_each_present_cpu(cpu) {
+               unsigned int apicid = apic->cpu_present_to_apicid(cpu);
+
+               if (apicid == BAD_APICID || !apic->apic_id_valid(apicid))
+                       continue;
+               if (!topology_update_package_map(apicid, cpu))
+                       continue;
+               pr_warn("CPU %u APICId %x disabled\n", cpu, apicid);
+               per_cpu(x86_bios_cpu_apicid, cpu) = BAD_APICID;
+               set_cpu_possible(cpu, false);
+               set_cpu_present(cpu, false);
+       }
+}
+
  void __init smp_store_boot_cpu_info(void)
  {
         int id = 0; /* CPU 0 */
@@ -258,6 +357,7 @@ void __init smp_store_boot_cpu_info(void)
  
         *c = boot_cpu_data;
         c->cpu_index = id;
+       smp_init_package_map();
  }
  
  /*
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c

index 3f92ce07e525fd41167a64f9d7c4df1bfbe673bf..27538f183c3b15d9d59e225f2612b12660ee979b 100644 (file)
--- a/arch/x86/kernel/test_nx.c
+++ b/arch/x86/kernel/test_nx.c
@@ -142,7 +142,6 @@ static int test_NX(void)
          * by the error message
          */
  
-#ifdef CONFIG_DEBUG_RODATA
         /* Test 3: Check if the .rodata section is executable */
         if (rodata_test_data != 0xC3) {
                 printk(KERN_ERR "test_nx: .rodata marker has invalid value\n");
@@ -151,7 +150,6 @@ static int test_NX(void)
                 printk(KERN_ERR "test_nx: .rodata section is executable\n");
                 ret = -ENODEV;
         }
-#endif
  
  #if 0
         /* Test 4: Check if the .data section of a module is executable */
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c

index 5ecbfe5099dad68e140b85ffdaefc75818afc6d9..cb4a01b41e277887b6769e4a17d60cb4139f9864 100644 (file)
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -76,5 +76,5 @@ int rodata_test(void)
  }
  
  MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure");
+MODULE_DESCRIPTION("Testcase for marking rodata as read-only");
  MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c

index ade185a46b1da63cea0be1ce47731e301f9d1d5a..5c9ca2bb9fe95c249680f69e076a2320e1855aa1 100644 (file)
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -83,30 +83,16 @@ gate_desc idt_table[NR_VECTORS] __page_aligned_bss;
  DECLARE_BITMAP(used_vectors, NR_VECTORS);
  EXPORT_SYMBOL_GPL(used_vectors);
  
-static inline void conditional_sti(struct pt_regs *regs)
+static inline void cond_local_irq_enable(struct pt_regs *regs)
  {
         if (regs->flags & X86_EFLAGS_IF)
                 local_irq_enable();
  }
  
-static inline void preempt_conditional_sti(struct pt_regs *regs)
-{
-       preempt_count_inc();
-       if (regs->flags & X86_EFLAGS_IF)
-               local_irq_enable();
-}
-
-static inline void conditional_cli(struct pt_regs *regs)
-{
-       if (regs->flags & X86_EFLAGS_IF)
-               local_irq_disable();
-}
-
-static inline void preempt_conditional_cli(struct pt_regs *regs)
+static inline void cond_local_irq_disable(struct pt_regs *regs)
  {
         if (regs->flags & X86_EFLAGS_IF)
                 local_irq_disable();
-       preempt_count_dec();
  }
  
  void ist_enter(struct pt_regs *regs)
@@ -199,7 +185,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
         }
  
         if (!user_mode(regs)) {
-               if (!fixup_exception(regs)) {
+               if (!fixup_exception(regs, trapnr)) {
                         tsk->thread.error_code = error_code;
                         tsk->thread.trap_nr = trapnr;
                         die(str, regs, error_code);
@@ -262,7 +248,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
         tsk->thread.error_code = error_code;
         tsk->thread.trap_nr = trapnr;
  
-#ifdef CONFIG_X86_64
         if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
             printk_ratelimit()) {
                 pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
@@ -271,7 +256,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
                 print_vma_addr(" in ", regs->ip);
                 pr_cont("\n");
         }
-#endif
  
         force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
  }
@@ -286,7 +270,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
  
         if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
                         NOTIFY_STOP) {
-               conditional_sti(regs);
+               cond_local_irq_enable(regs);
                 do_trap(trapnr, signr, str, regs, error_code,
                         fill_trap_info(regs, signr, trapnr, &info));
         }
@@ -368,7 +352,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
         if (notify_die(DIE_TRAP, "bounds", regs, error_code,
                         X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
                 return;
-       conditional_sti(regs);
+       cond_local_irq_enable(regs);
  
         if (!user_mode(regs))
                 die("bounds", regs, error_code);
@@ -443,7 +427,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
         struct task_struct *tsk;
  
         RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
-       conditional_sti(regs);
+       cond_local_irq_enable(regs);
  
         if (v8086_mode(regs)) {
                 local_irq_enable();
@@ -453,7 +437,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
  
         tsk = current;
         if (!user_mode(regs)) {
-               if (fixup_exception(regs))
+               if (fixup_exception(regs, X86_TRAP_GP))
                         return;
  
                 tsk->thread.error_code = error_code;
@@ -517,9 +501,11 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
          * as we may switch to the interrupt stack.
          */
         debug_stack_usage_inc();
-       preempt_conditional_sti(regs);
+       preempt_disable();
+       cond_local_irq_enable(regs);
         do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
-       preempt_conditional_cli(regs);
+       cond_local_irq_disable(regs);
+       preempt_enable_no_resched();
         debug_stack_usage_dec();
  exit:
         ist_exit(regs);
@@ -571,6 +557,29 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
  NOKPROBE_SYMBOL(fixup_bad_iret);
  #endif
  
+static bool is_sysenter_singlestep(struct pt_regs *regs)
+{
+       /*
+        * We don't try for precision here.  If we're anywhere in the region of
+        * code that can be single-stepped in the SYSENTER entry path, then
+        * assume that this is a useless single-step trap due to SYSENTER
+        * being invoked with TF set.  (We don't know in advance exactly
+        * which instructions will be hit because BTF could plausibly
+        * be set.)
+        */
+#ifdef CONFIG_X86_32
+       return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) <
+               (unsigned long)__end_SYSENTER_singlestep_region -
+               (unsigned long)__begin_SYSENTER_singlestep_region;
+#elif defined(CONFIG_IA32_EMULATION)
+       return (regs->ip - (unsigned long)entry_SYSENTER_compat) <
+               (unsigned long)__end_entry_SYSENTER_compat -
+               (unsigned long)entry_SYSENTER_compat;
+#else
+       return false;
+#endif
+}
+
  /*
   * Our handling of the processor debug registers is non-trivial.
   * We do not clear them on entry and exit from the kernel. Therefore
@@ -605,10 +614,41 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
         ist_enter(regs);
  
         get_debugreg(dr6, 6);
+       /*
+        * The Intel SDM says:
+        *
+        *   Certain debug exceptions may clear bits 0-3. The remaining
+        *   contents of the DR6 register are never cleared by the
+        *   processor. To avoid confusion in identifying debug
+        *   exceptions, debug handlers should clear the register before
+        *   returning to the interrupted task.
+        *
+        * Keep it simple: clear DR6 immediately.
+        */
+       set_debugreg(0, 6);
  
         /* Filter out all the reserved bits which are preset to 1 */
         dr6 &= ~DR6_RESERVED;
  
+       /*
+        * The SDM says "The processor clears the BTF flag when it
+        * generates a debug exception."  Clear TIF_BLOCKSTEP to keep
+        * TIF_BLOCKSTEP in sync with the hardware BTF flag.
+        */
+       clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
+
+       if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) &&
+                    is_sysenter_singlestep(regs))) {
+               dr6 &= ~DR_STEP;
+               if (!dr6)
+                       goto exit;
+               /*
+                * else we might have gotten a single-step trap and hit a
+                * watchpoint at the same time, in which case we should fall
+                * through and handle the watchpoint.
+                */
+       }
+
         /*
          * If dr6 has no reason to give us about the origin of this trap,
          * then it's very likely the result of an icebp/int01 trap.
@@ -617,18 +657,10 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
         if (!dr6 && user_mode(regs))
                 user_icebp = 1;
  
-       /* Catch kmemcheck conditions first of all! */
+       /* Catch kmemcheck conditions! */
         if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
                 goto exit;
  
-       /* DR6 may or may not be cleared by the CPU */
-       set_debugreg(0, 6);
-
-       /*
-        * The processor cleared BTF, so don't mark that we need it set.
-        */
-       clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
-
         /* Store the virtualized DR6 value */
         tsk->thread.debugreg6 = dr6;
  
@@ -648,24 +680,25 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
         debug_stack_usage_inc();
  
         /* It's safe to allow irq's after DR6 has been saved */
-       preempt_conditional_sti(regs);
+       preempt_disable();
+       cond_local_irq_enable(regs);
  
         if (v8086_mode(regs)) {
                 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
                                         X86_TRAP_DB);
-               preempt_conditional_cli(regs);
+               cond_local_irq_disable(regs);
+               preempt_enable_no_resched();
                 debug_stack_usage_dec();
                 goto exit;
         }
  
-       /*
-        * Single-stepping through system calls: ignore any exceptions in
-        * kernel space, but re-enable TF when returning to user mode.
-        *
-        * We already checked v86 mode above, so we can check for kernel mode
-        * by just checking the CPL of CS.
-        */
-       if ((dr6 & DR_STEP) && !user_mode(regs)) {
+       if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
+               /*
+                * Historical junk that used to handle SYSENTER single-stepping.
+                * This should be unreachable now.  If we survive for a while
+                * without anyone hitting this warning, we'll turn this into
+                * an oops.
+                */
                 tsk->thread.debugreg6 &= ~DR_STEP;
                 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
                 regs->flags &= ~X86_EFLAGS_TF;
@@ -673,10 +706,19 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
         si_code = get_si_code(tsk->thread.debugreg6);
         if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
                 send_sigtrap(tsk, regs, error_code, si_code);
-       preempt_conditional_cli(regs);
+       cond_local_irq_disable(regs);
+       preempt_enable_no_resched();
         debug_stack_usage_dec();
  
  exit:
+#if defined(CONFIG_X86_32)
+       /*
+        * This is the most likely code path that involves non-trivial use
+        * of the SYSENTER stack.  Check that we haven't overrun it.
+        */
+       WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
+            "Overran or corrupted SYSENTER stack\n");
+#endif
         ist_exit(regs);
  }
  NOKPROBE_SYMBOL(do_debug);
@@ -696,10 +738,10 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
  
         if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
                 return;
-       conditional_sti(regs);
+       cond_local_irq_enable(regs);
  
         if (!user_mode(regs)) {
-               if (!fixup_exception(regs)) {
+               if (!fixup_exception(regs, trapnr)) {
                         task->thread.error_code = error_code;
                         task->thread.trap_nr = trapnr;
                         die(str, regs, error_code);
@@ -743,7 +785,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
  dotraplinkage void
  do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
  {
-       conditional_sti(regs);
+       cond_local_irq_enable(regs);
  }
  
  dotraplinkage void
@@ -756,7 +798,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
         if (read_cr0() & X86_CR0_EM) {
                 struct math_emu_info info = { };
  
-               conditional_sti(regs);
+               cond_local_irq_enable(regs);
  
                 info.regs = regs;
                 math_emulate(&info);
@@ -765,7 +807,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
  #endif
         fpu__restore(&current->thread.fpu); /* interrupts still off */
  #ifdef CONFIG_X86_32
-       conditional_sti(regs);
+       cond_local_irq_enable(regs);
  #endif
  }
  NOKPROBE_SYMBOL(do_device_not_available);
@@ -868,7 +910,7 @@ void __init trap_init(void)
  #endif
  
  #ifdef CONFIG_X86_32
-       set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
+       set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
         set_bit(IA32_SYSCALL_VECTOR, used_vectors);
  #endif
  
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S

index 07efb35ee4bc3fd5f4452af68214de79dc936716..014ea59aa153e4f37725dd8771cdb37d88371e34 100644 (file)
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -30,7 +30,7 @@
   *     appropriately. Either display a message or halt.
   */
  
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/msr-index.h>
  
  verify_cpu:
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c

index e574b85465185fe273a839093452c26521ac3d1e..3dce1ca0a653091967f7089ce9e89c9d54399408 100644 (file)
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -362,7 +362,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
         /* make room for real-mode segments */
         tsk->thread.sp0 += 16;
  
-       if (static_cpu_has_safe(X86_FEATURE_SEP))
+       if (static_cpu_has(X86_FEATURE_SEP))
                 tsk->thread.sysenter_cs = 0;
  
         load_sp0(tss, &tsk->thread);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S

index 74e4bf11f562e0354c227518421e2375ec16fafa..5af9958cbdb6be4293b62181ffe104f435caf830 100644 (file)
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -41,29 +41,28 @@ ENTRY(phys_startup_64)
  jiffies_64 = jiffies;
  #endif
  
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
  /*
- * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
- * we retain large page mappings for boundaries spanning kernel text, rodata
- * and data sections.
+ * On 64-bit, align RODATA to 2MB so we retain large page mappings for
+ * boundaries spanning kernel text, rodata and data sections.
   *
   * However, kernel identity mappings will have different RWX permissions
   * to the pages mapping to text and to the pages padding (which are freed) the
   * text section. Hence kernel identity mappings will be broken to smaller
   * pages. For 64-bit, kernel text and kernel identity mappings are different,
- * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
- * as well as retain 2MB large page mappings for kernel text.
+ * so we can enable protection checks as well as retain 2MB large page
+ * mappings for kernel text.
   */
-#define X64_ALIGN_DEBUG_RODATA_BEGIN   . = ALIGN(HPAGE_SIZE);
+#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
  
-#define X64_ALIGN_DEBUG_RODATA_END                             \
+#define X64_ALIGN_RODATA_END                                   \
                 . = ALIGN(HPAGE_SIZE);                          \
                 __end_rodata_hpage_align = .;
  
  #else
  
-#define X64_ALIGN_DEBUG_RODATA_BEGIN
-#define X64_ALIGN_DEBUG_RODATA_END
+#define X64_ALIGN_RODATA_BEGIN
+#define X64_ALIGN_RODATA_END
  
  #endif
  
@@ -112,13 +111,11 @@ SECTIONS
  
         EXCEPTION_TABLE(16) :text = 0x9090
  
-#if defined(CONFIG_DEBUG_RODATA)
         /* .text should occupy whole number of pages */
         . = ALIGN(PAGE_SIZE);
-#endif
-       X64_ALIGN_DEBUG_RODATA_BEGIN
+       X64_ALIGN_RODATA_BEGIN
         RO_DATA(PAGE_SIZE)
-       X64_ALIGN_DEBUG_RODATA_END
+       X64_ALIGN_RODATA_END
  
         /* Data */
         .data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -195,6 +192,17 @@ SECTIONS
         :init
  #endif
  
+       /*
+        * Section for code used exclusively before alternatives are run. All
+        * references to such code must be patched out by alternatives, normally
+        * by using X86_FEATURE_ALWAYS CPU feature bit.
+        *
+        * See static_cpu_has() for an example.
+        */
+       .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) {
+               *(.altinstr_aux)
+       }
+
         INIT_DATA_SECTION(16)
  
         .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c

index a0695be19864eda009742b142ba2046e13ad05f9..cd05942bc9189452d8ec7c0cebd96431cf8dd394 100644 (file)
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache);
  EXPORT_SYMBOL(_copy_from_user);
  EXPORT_SYMBOL(_copy_to_user);
  
+EXPORT_SYMBOL_GPL(memcpy_mcsafe);
+
  EXPORT_SYMBOL(copy_page);
  EXPORT_SYMBOL(clear_page);
  
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c

index 36591faed13be04d12c13fa520d46ca9df0dfcf8..3a045f39ed8114e24e375521135cb7d2296e9e7e 100644 (file)
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1195,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
  static void apic_timer_expired(struct kvm_lapic *apic)
  {
         struct kvm_vcpu *vcpu = apic->vcpu;
-       wait_queue_head_t *q = &vcpu->wq;
+       struct swait_queue_head *q = &vcpu->wq;
         struct kvm_timer *ktimer = &apic->lapic_timer;
  
         if (atomic_read(&apic->lapic_timer.pending))
@@ -1204,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic)
         atomic_inc(&apic->lapic_timer.pending);
         kvm_set_pending_timer(vcpu);
  
-       if (waitqueue_active(q))
-               wake_up_interruptible(q);
+       if (swait_active(q))
+               swake_up(q);
  
         if (apic_lvtt_tscdeadline(apic))
                 ktimer->expired_tscdeadline = ktimer->tscdeadline;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c

index 95a955de5964bcc3f4aa6791a004e29b28504c13..1e7a49bfc94fb323cbb11782693cab89743f1133 100644 (file)
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3721,13 +3721,15 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
  void
  reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
  {
+       bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
+
         /*
          * Passing "true" to the last argument is okay; it adds a check
          * on bit 8 of the SPTEs which KVM doesn't use anyway.
          */
         __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
                                 boot_cpu_data.x86_phys_bits,
-                               context->shadow_root_level, context->nx,
+                               context->shadow_root_level, uses_nx,
                                 guest_cpuid_has_gbpages(vcpu), is_pse(vcpu),
                                 true);
  }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c

index 0ff453749a909b9f0140b5de5d5d2dbb133e99d9..9bd8f44baded2318e8b5bc2a4773068a45a8723b 100644 (file)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1813,6 +1813,13 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
                         return;
                 }
                 break;
+       case MSR_IA32_PEBS_ENABLE:
+               /* PEBS needs a quiescent period after being disabled (to write
+                * a record).  Disabling PEBS through VMX MSR swapping doesn't
+                * provide that period, so a CPU could write host's record into
+                * guest's memory.
+                */
+               wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
         }
  
         for (i = 0; i < m->nr; ++i)
@@ -1850,26 +1857,31 @@ static void reload_tss(void)
  
  static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
  {
-       u64 guest_efer;
-       u64 ignore_bits;
+       u64 guest_efer = vmx->vcpu.arch.efer;
+       u64 ignore_bits = 0;
  
-       guest_efer = vmx->vcpu.arch.efer;
+       if (!enable_ept) {
+               /*
+                * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
+                * host CPUID is more efficient than testing guest CPUID
+                * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
+                */
+               if (boot_cpu_has(X86_FEATURE_SMEP))
+                       guest_efer |= EFER_NX;
+               else if (!(guest_efer & EFER_NX))
+                       ignore_bits |= EFER_NX;
+       }
  
         /*
-        * NX is emulated; LMA and LME handled by hardware; SCE meaningless
-        * outside long mode
+        * LMA and LME handled by hardware; SCE meaningless outside long mode.
          */
-       ignore_bits = EFER_NX | EFER_SCE;
+       ignore_bits |= EFER_SCE;
  #ifdef CONFIG_X86_64
         ignore_bits |= EFER_LMA | EFER_LME;
         /* SCE is meaningful only in long mode on Intel */
         if (guest_efer & EFER_LMA)
                 ignore_bits &= ~(u64)EFER_SCE;
  #endif
-       guest_efer &= ~ignore_bits;
-       guest_efer |= host_efer & ignore_bits;
-       vmx->guest_msrs[efer_offset].data = guest_efer;
-       vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
  
         clear_atomic_switch_msr(vmx, MSR_EFER);
  
@@ -1880,16 +1892,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
          */
         if (cpu_has_load_ia32_efer ||
             (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
-               guest_efer = vmx->vcpu.arch.efer;
                 if (!(guest_efer & EFER_LMA))
                         guest_efer &= ~EFER_LME;
                 if (guest_efer != host_efer)
                         add_atomic_switch_msr(vmx, MSR_EFER,
                                               guest_efer, host_efer);
                 return false;
-       }
+       } else {
+               guest_efer &= ~ignore_bits;
+               guest_efer |= host_efer & ignore_bits;
  
-       return true;
+               vmx->guest_msrs[efer_offset].data = guest_efer;
+               vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+
+               return true;
+       }
  }
  
  static unsigned long segment_base(u16 selector)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c

index 4ba229ac3f4ff127dddf0c33fb91cc42f2e60b22..fd57d3ae7e16daf24f8cfdb4e708c5e4e9a9f3f9 100644 (file)
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1520,12 +1520,6 @@ __init void lguest_init(void)
          */
         reserve_top_address(lguest_data.reserve_mem);
  
-       /*
-        * If we don't initialize the lock dependency checker now, it crashes
-        * atomic_notifier_chain_register, then paravirt_disable_iospace.
-        */
-       lockdep_init();
-
         /* Hook in our special panic hypercall code. */
         atomic_notifier_chain_register(&panic_notifier_list, &paniced);
  
@@ -1535,7 +1529,7 @@ __init void lguest_init(void)
          */
         cpu_detect(&new_cpu_data);
         /* head.S usually sets up the first capability word, so do it here. */
-       new_cpu_data.x86_capability[0] = cpuid_edx(1);
+       new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
  
         /* Math is always hard! */
         set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S

index a2fe51b00ccefc660850e1a6810dd4adee611d2e..65be7cfaf947228d454f2324541e5f5227d85183 100644 (file)
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,5 @@
  #include <linux/linkage.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/alternative-asm.h>
  
  /*
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c

index 422db000d72767adf718dc75db682e5df576e877..5cc78bf572325fb1c5b5d6f854bfe878c55dfeb3 100644 (file)
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -21,12 +21,16 @@ static inline int myisspace(u8 c)
   * @option: option string to look for
   *
   * Returns the position of that @option (starts counting with 1)
- * or 0 on not found.
+ * or 0 on not found.  @option will only be found if it is found
+ * as an entire word in @cmdline.  For instance, if @option="car"
+ * then a cmdline which contains "cart" will not match.
   */
-int cmdline_find_option_bool(const char *cmdline, const char *option)
+static int
+__cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
+                          const char *option)
  {
         char c;
-       int len, pos = 0, wstart = 0;
+       int pos = 0, wstart = 0;
         const char *opptr = NULL;
         enum {
                 st_wordstart = 0,       /* Start of word/after whitespace */
@@ -37,11 +41,11 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
         if (!cmdline)
                 return -1;      /* No command line */
  
-       len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE);
-       if (!len)
-               return 0;
-
-       while (len--) {
+       /*
+        * This 'pos' check ensures we do not overrun
+        * a non-NULL-terminated 'cmdline'
+        */
+       while (pos < max_cmdline_size) {
                 c = *(char *)cmdline++;
                 pos++;
  
@@ -58,18 +62,35 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
                         /* fall through */
  
                 case st_wordcmp:
-                       if (!*opptr)
+                       if (!*opptr) {
+                               /*
+                                * We matched all the way to the end of the
+                                * option we were looking for.  If the
+                                * command-line has a space _or_ ends, then
+                                * we matched!
+                                */
                                 if (!c || myisspace(c))
                                         return wstart;
-                               else
-                                       state = st_wordskip;
-                       else if (!c)
+                               /*
+                                * We hit the end of the option, but _not_
+                                * the end of a word on the cmdline.  Not
+                                * a match.
+                                */
+                       } else if (!c) {
+                               /*
+                                * Hit the NULL terminator on the end of
+                                * cmdline.
+                                */
                                 return 0;
-                       else if (c != *opptr++)
-                               state = st_wordskip;
-                       else if (!len)          /* last word and is matching */
-                               return wstart;
-                       break;
+                       } else if (c == *opptr++) {
+                               /*
+                                * We are currently matching, so continue
+                                * to the next character on the cmdline.
+                                */
+                               break;
+                       }
+                       state = st_wordskip;
+                       /* fall through */
  
                 case st_wordskip:
                         if (!c)
@@ -82,3 +103,8 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
  
         return 0;       /* Buffer overrun */
  }
+
+int cmdline_find_option_bool(const char *cmdline, const char *option)
+{
+       return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
+}
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S

index 009f98216b7eb316c12847e42f50ac77d4f4b8a3..24ef1c2104d422c35a9ce783934306a0dd9215f1 100644 (file)
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -1,7 +1,7 @@
  /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
  
  #include <linux/linkage.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/alternative-asm.h>
  
  /*
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S

index 27f89c79a44b7da6fc6aef54e3961426e542e7ed..2b0ef26da0bde8eabc38f5b42ef9e4aa02464ad4 100644 (file)
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -10,7 +10,7 @@
  #include <asm/current.h>
  #include <asm/asm-offsets.h>
  #include <asm/thread_info.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/alternative-asm.h>
  #include <asm/asm.h>
  #include <asm/smap.h>
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c

index e912b2f6d36e5392b33bf6749f19a53cff6d7a06..2f07c291dcc855af5c483c55133d91a4dff21c92 100644 (file)
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -102,7 +102,7 @@ static void delay_mwaitx(unsigned long __loops)
                  * Use cpu_tss as a cacheline-aligned, seldomly
                  * accessed per-cpu variable as the monitor target.
                  */
-               __monitorx(this_cpu_ptr(&cpu_tss), 0, 0);
+               __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
  
                 /*
                  * AMD, like Intel, supports the EAX hint and EAX=0xf
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S

index 16698bba87deb9ff593d934e8a8fb1447be9b214..cbb8ee5830ff134f131533fd91ea8be35a853dba 100644 (file)
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,7 +1,7 @@
  /* Copyright 2002 Andi Kleen */
  
  #include <linux/linkage.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/alternative-asm.h>
  
  /*
@@ -177,3 +177,120 @@ ENTRY(memcpy_orig)
  .Lend:
         retq
  ENDPROC(memcpy_orig)
+
+#ifndef CONFIG_UML
+/*
+ * memcpy_mcsafe - memory copy with machine check exception handling
+ * Note that we only catch machine checks when reading the source addresses.
+ * Writes to target are posted and don't generate machine checks.
+ */
+ENTRY(memcpy_mcsafe)
+       cmpl $8, %edx
+       /* Less than 8 bytes? Go to byte copy loop */
+       jb .L_no_whole_words
+
+       /* Check for bad alignment of source */
+       testl $7, %esi
+       /* Already aligned */
+       jz .L_8byte_aligned
+
+       /* Copy one byte at a time until source is 8-byte aligned */
+       movl %esi, %ecx
+       andl $7, %ecx
+       subl $8, %ecx
+       negl %ecx
+       subl %ecx, %edx
+.L_copy_leading_bytes:
+       movb (%rsi), %al
+       movb %al, (%rdi)
+       incq %rsi
+       incq %rdi
+       decl %ecx
+       jnz .L_copy_leading_bytes
+
+.L_8byte_aligned:
+       /* Figure out how many whole cache lines (64-bytes) to copy */
+       movl %edx, %ecx
+       andl $63, %edx
+       shrl $6, %ecx
+       jz .L_no_whole_cache_lines
+
+       /* Loop copying whole cache lines */
+.L_cache_w0: movq (%rsi), %r8
+.L_cache_w1: movq 1*8(%rsi), %r9
+.L_cache_w2: movq 2*8(%rsi), %r10
+.L_cache_w3: movq 3*8(%rsi), %r11
+       movq %r8, (%rdi)
+       movq %r9, 1*8(%rdi)
+       movq %r10, 2*8(%rdi)
+       movq %r11, 3*8(%rdi)
+.L_cache_w4: movq 4*8(%rsi), %r8
+.L_cache_w5: movq 5*8(%rsi), %r9
+.L_cache_w6: movq 6*8(%rsi), %r10
+.L_cache_w7: movq 7*8(%rsi), %r11
+       movq %r8, 4*8(%rdi)
+       movq %r9, 5*8(%rdi)
+       movq %r10, 6*8(%rdi)
+       movq %r11, 7*8(%rdi)
+       leaq 64(%rsi), %rsi
+       leaq 64(%rdi), %rdi
+       decl %ecx
+       jnz .L_cache_w0
+
+       /* Are there any trailing 8-byte words? */
+.L_no_whole_cache_lines:
+       movl %edx, %ecx
+       andl $7, %edx
+       shrl $3, %ecx
+       jz .L_no_whole_words
+
+       /* Copy trailing words */
+.L_copy_trailing_words:
+       movq (%rsi), %r8
+       mov %r8, (%rdi)
+       leaq 8(%rsi), %rsi
+       leaq 8(%rdi), %rdi
+       decl %ecx
+       jnz .L_copy_trailing_words
+
+       /* Any trailing bytes? */
+.L_no_whole_words:
+       andl %edx, %edx
+       jz .L_done_memcpy_trap
+
+       /* Copy trailing bytes */
+       movl %edx, %ecx
+.L_copy_trailing_bytes:
+       movb (%rsi), %al
+       movb %al, (%rdi)
+       incq %rsi
+       incq %rdi
+       decl %ecx
+       jnz .L_copy_trailing_bytes
+
+       /* Copy successful. Return true */
+.L_done_memcpy_trap:
+       xorq %rax, %rax
+       ret
+ENDPROC(memcpy_mcsafe)
+
+       .section .fixup, "ax"
+       /* Return false for any failure */
+.L_memcpy_mcsafe_fail:
+       mov     $1, %rax
+       ret
+
+       .previous
+
+       _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
+#endif
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S

index ca2afdd6d98ed2be90da6f9ea1624beb102f0fc3..90ce01bee00c17f173719652659edb5972ecef07 100644 (file)
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -6,7 +6,7 @@
   *     - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
   */
  #include <linux/linkage.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/alternative-asm.h>
  
  #undef memmove
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S

index 2661fad0582716f780af9904dc5b7c62199a5c58..c9c81227ea37d14968b3acb3ccfa346e73cda5dc 100644 (file)
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -1,7 +1,7 @@
  /* Copyright 2002 Andi Kleen, SuSE Labs */
  
  #include <linux/linkage.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/alternative-asm.h>
  
  .weak memset
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c

index 903ec1e9c3263f7f7ce6e1f5a59e7a32b224dc85..9dd7e4b7fcdee4a847f98937b9a287b826366a2d 100644 (file)
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -3,6 +3,9 @@
  #include <linux/sort.h>
  #include <asm/uaccess.h>
  
+typedef bool (*ex_handler_t)(const struct exception_table_entry *,
+                           struct pt_regs *, int);
+
  static inline unsigned long
  ex_insn_addr(const struct exception_table_entry *x)
  {
@@ -13,11 +16,56 @@ ex_fixup_addr(const struct exception_table_entry *x)
  {
         return (unsigned long)&x->fixup + x->fixup;
  }
+static inline ex_handler_t
+ex_fixup_handler(const struct exception_table_entry *x)
+{
+       return (ex_handler_t)((unsigned long)&x->handler + x->handler);
+}
  
-int fixup_exception(struct pt_regs *regs)
+bool ex_handler_default(const struct exception_table_entry *fixup,
+                      struct pt_regs *regs, int trapnr)
  {
-       const struct exception_table_entry *fixup;
-       unsigned long new_ip;
+       regs->ip = ex_fixup_addr(fixup);
+       return true;
+}
+EXPORT_SYMBOL(ex_handler_default);
+
+bool ex_handler_fault(const struct exception_table_entry *fixup,
+                    struct pt_regs *regs, int trapnr)
+{
+       regs->ip = ex_fixup_addr(fixup);
+       regs->ax = trapnr;
+       return true;
+}
+EXPORT_SYMBOL_GPL(ex_handler_fault);
+
+bool ex_handler_ext(const struct exception_table_entry *fixup,
+                  struct pt_regs *regs, int trapnr)
+{
+       /* Special hack for uaccess_err */
+       current_thread_info()->uaccess_err = 1;
+       regs->ip = ex_fixup_addr(fixup);
+       return true;
+}
+EXPORT_SYMBOL(ex_handler_ext);
+
+bool ex_has_fault_handler(unsigned long ip)
+{
+       const struct exception_table_entry *e;
+       ex_handler_t handler;
+
+       e = search_exception_tables(ip);
+       if (!e)
+               return false;
+       handler = ex_fixup_handler(e);
+
+       return handler == ex_handler_fault;
+}
+
+int fixup_exception(struct pt_regs *regs, int trapnr)
+{
+       const struct exception_table_entry *e;
+       ex_handler_t handler;
  
  #ifdef CONFIG_PNPBIOS
         if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -33,42 +81,34 @@ int fixup_exception(struct pt_regs *regs)
         }
  #endif
  
-       fixup = search_exception_tables(regs->ip);
-       if (fixup) {
-               new_ip = ex_fixup_addr(fixup);
-
-               if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-                       /* Special hack for uaccess_err */
-                       current_thread_info()->uaccess_err = 1;
-                       new_ip -= 0x7ffffff0;
-               }
-               regs->ip = new_ip;
-               return 1;
-       }
+       e = search_exception_tables(regs->ip);
+       if (!e)
+               return 0;
  
-       return 0;
+       handler = ex_fixup_handler(e);
+       return handler(e, regs, trapnr);
  }
  
  /* Restricted version used during very early boot */
  int __init early_fixup_exception(unsigned long *ip)
  {
-       const struct exception_table_entry *fixup;
+       const struct exception_table_entry *e;
         unsigned long new_ip;
+       ex_handler_t handler;
  
-       fixup = search_exception_tables(*ip);
-       if (fixup) {
-               new_ip = ex_fixup_addr(fixup);
+       e = search_exception_tables(*ip);
+       if (!e)
+               return 0;
  
-               if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-                       /* uaccess handling not supported during early boot */
-                       return 0;
-               }
+       new_ip  = ex_fixup_addr(e);
+       handler = ex_fixup_handler(e);
  
-               *ip = new_ip;
-               return 1;
-       }
+       /* special handling not supported during early boot */
+       if (handler != ex_handler_default)
+               return 0;
  
-       return 0;
+       *ip = new_ip;
+       return 1;
  }
  
  /*
@@ -133,6 +173,8 @@ void sort_extable(struct exception_table_entry *start,
                 i += 4;
                 p->fixup += i;
                 i += 4;
+               p->handler += i;
+               i += 4;
         }
  
         sort(start, finish - start, sizeof(struct exception_table_entry),
@@ -145,6 +187,8 @@ void sort_extable(struct exception_table_entry *start,
                 i += 4;
                 p->fixup -= i;
                 i += 4;
+               p->handler -= i;
+               i += 4;
         }
  }
  
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c

index e830c71a13232f4305adf9ef1eb8ce6ffe201d5b..03898aea6e0f2697e2196e9de7f8fe3aeb200ec9 100644 (file)
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -663,7 +663,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
         int sig;
  
         /* Are we prepared to handle this kernel fault? */
-       if (fixup_exception(regs)) {
+       if (fixup_exception(regs, X86_TRAP_PF)) {
                 /*
                  * Any interrupt that takes a fault gets the fixup. This makes
                  * the below recursive fault logic only apply to a faults from
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c

index cb4ef3de61f9ae9c95249876965a71a5d7b58cf8..2ebfbaf611424be1937c4e2288776d50d1c8ff9e 100644 (file)
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -871,7 +871,6 @@ static noinline int do_test_wp_bit(void)
         return flag;
  }
  
-#ifdef CONFIG_DEBUG_RODATA
  const int rodata_test_data = 0xC3;
  EXPORT_SYMBOL_GPL(rodata_test_data);
  
@@ -960,5 +959,3 @@ void mark_rodata_ro(void)
         if (__supported_pte_mask & _PAGE_NX)
                 debug_checkwx();
  }
-#endif
-
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c

index 5488d21123bd2edb3f71ded119495474f3e8b728..a40b755c67e31280ded6fe10d81a18927523059c 100644 (file)
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1074,7 +1074,6 @@ void __init mem_init(void)
         mem_init_print_info(NULL);
  }
  
-#ifdef CONFIG_DEBUG_RODATA
  const int rodata_test_data = 0xC3;
  EXPORT_SYMBOL_GPL(rodata_test_data);
  
@@ -1166,8 +1165,6 @@ void mark_rodata_ro(void)
         debug_checkwx();
  }
  
-#endif
-
  int kern_addr_valid(unsigned long addr)
  {
         unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c

index 9cf96d82147a8da38ce4f57413d75075003cb244..1c37e650acacff422460b1d9b705a867e01a09a5 100644 (file)
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -283,7 +283,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
                    __pa_symbol(__end_rodata) >> PAGE_SHIFT))
                 pgprot_val(forbidden) |= _PAGE_RW;
  
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
         /*
          * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
          * kernel text mappings for the large page aligned text, rodata sections
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c

index f4ae536b0914db1db521feabe09ae0ab681a4b87..04e2e7144beee51205df17a4e3df024833eac7db 100644 (file)
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -943,7 +943,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
                         return -EINVAL;
         }
  
-       *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
+       *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
                          cachemode2protval(pcm));
  
         return 0;
@@ -959,7 +959,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
  
         /* Set prot based on lookup */
         pcm = lookup_memtype(pfn_t_to_phys(pfn));
-       *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
+       *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
                          cachemode2protval(pcm));
  
         return 0;
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c

index 92e2eacb33216821e8b18821834c7ce972571f29..f65a33f505b683a02aa7543ea653e54a8cdedc28 100644 (file)
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -4,6 +4,7 @@
  
  #include <asm/pgtable.h>
  #include <asm/proto.h>
+#include <asm/cpufeature.h>
  
  static int disable_nx;
  
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c

index 50d86c0e9ba4973b707de5917cfae78692fd6f68..660a83c8287b61dc52c069832128a83f256258af 100644 (file)
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -24,7 +24,6 @@
  #include <asm/nmi.h>
  #include <asm/apic.h>
  #include <asm/processor.h>
-#include <asm/cpufeature.h>
  
  #include "op_x86_model.h"
  #include "op_counter.h"
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c

index 2d66db8f80f992d3b0609373502031aacf91f161..ed30e79347e86377198009dcafb1c0b26a9865df 100644 (file)
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -130,6 +130,27 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
  }
  EXPORT_SYMBOL_GPL(efi_query_variable_store);
  
+/*
+ * Helper function for efi_reserve_boot_services() to figure out if we
+ * can free regions in efi_free_boot_services().
+ *
+ * Use this function to ensure we do not free regions owned by somebody
+ * else. We must only reserve (and then free) regions:
+ *
+ * - Not within any part of the kernel
+ * - Not the BIOS reserved area (E820_RESERVED, E820_NVS, etc)
+ */
+static bool can_free_region(u64 start, u64 size)
+{
+       if (start + size > __pa_symbol(_text) && start <= __pa_symbol(_end))
+               return false;
+
+       if (!e820_all_mapped(start, start+size, E820_RAM))
+               return false;
+
+       return true;
+}
+
  /*
   * The UEFI specification makes it clear that the operating system is free to do
   * whatever it wants with boot services code after ExitBootServices() has been
@@ -147,26 +168,50 @@ void __init efi_reserve_boot_services(void)
                 efi_memory_desc_t *md = p;
                 u64 start = md->phys_addr;
                 u64 size = md->num_pages << EFI_PAGE_SHIFT;
+               bool already_reserved;
  
                 if (md->type != EFI_BOOT_SERVICES_CODE &&
                     md->type != EFI_BOOT_SERVICES_DATA)
                         continue;
-               /* Only reserve where possible:
-                * - Not within any already allocated areas
-                * - Not over any memory area (really needed, if above?)
-                * - Not within any part of the kernel
-                * - Not the bios reserved area
-               */
-               if ((start + size > __pa_symbol(_text)
-                               && start <= __pa_symbol(_end)) ||
-                       !e820_all_mapped(start, start+size, E820_RAM) ||
-                       memblock_is_region_reserved(start, size)) {
-                       /* Could not reserve, skip it */
-                       md->num_pages = 0;
-                       memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n",
-                                    start, start+size-1);
-               } else
+
+               already_reserved = memblock_is_region_reserved(start, size);
+
+               /*
+                * Because the following memblock_reserve() is paired
+                * with free_bootmem_late() for this region in
+                * efi_free_boot_services(), we must be extremely
+                * careful not to reserve, and subsequently free,
+                * critical regions of memory (like the kernel image) or
+                * those regions that somebody else has already
+                * reserved.
+                *
+                * A good example of a critical region that must not be
+                * freed is page zero (first 4Kb of memory), which may
+                * contain boot services code/data but is marked
+                * E820_RESERVED by trim_bios_range().
+                */
+               if (!already_reserved) {
                         memblock_reserve(start, size);
+
+                       /*
+                        * If we are the first to reserve the region, no
+                        * one else cares about it. We own it and can
+                        * free it later.
+                        */
+                       if (can_free_region(start, size))
+                               continue;
+               }
+
+               /*
+                * We don't own the region. We must not free it.
+                *
+                * Setting this bit for a boot services region really
+                * doesn't make sense as far as the firmware is
+                * concerned, but it does provide us with a way to tag
+                * those regions that must not be paired with
+                * free_bootmem_late().
+                */
+               md->attribute |= EFI_MEMORY_RUNTIME;
         }
  }
  
@@ -183,8 +228,8 @@ void __init efi_free_boot_services(void)
                     md->type != EFI_BOOT_SERVICES_DATA)
                         continue;
  
-               /* Could not reserve boot area */
-               if (!size)
+               /* Do not free, someone else owns it: */
+               if (md->attribute & EFI_MEMORY_RUNTIME)
                         continue;
  
                 free_bootmem_late(start, size);
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h

index 174781a404ff3cfbeb1f9dccaef820f7aa2e9a51..00c319048d52d78e58755f7da2639aa044e0b885 100644 (file)
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -3,7 +3,7 @@
  
  #include <asm/asm.h>
  #include <asm/segment.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
  #include <asm/cmpxchg.h>
  #include <asm/nops.h>
  
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c

index 439c0994b69689ade4fa49c6a9ead46e25597880..bfce503dffae23bb0b39c621f5d8c2e0d936fec5 100644 (file)
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -25,11 +25,11 @@
  
  #define old_mmap sys_old_mmap
  
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
  #include <asm/syscalls_32.h>
  
  #undef __SYSCALL_I386
-#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym,
+#define __SYSCALL_I386(nr, sym, qual) [ nr ] = sym,
  
  extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
  
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c

index b74ea6c2c0e7b2fdb43d2f60231edc8c994333c6..f306413d3eb6e332d5ad10db63700101cafd40f0 100644 (file)
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -35,14 +35,11 @@
  #define stub_execveat sys_execveat
  #define stub_rt_sigreturn sys_rt_sigreturn
  
-#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
-#define __SYSCALL_X32(nr, sym, compat) /* Not supported */
-
-#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
  #include <asm/syscalls_64.h>
  
  #undef __SYSCALL_64
-#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym,
+#define __SYSCALL_64(nr, sym, qual) [ nr ] = sym,
  
  extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
  
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c

index ce7e3607a870feac770b277970b74099c3049208..470564bbd08ea16359e32ab8a275a80f40bcfbda 100644 (file)
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -9,14 +9,12 @@
  #include <asm/types.h>
  
  #ifdef __i386__
-#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
  static char syscalls[] = {
  #include <asm/syscalls_32.h>
  };
  #else
-#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
-#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1,
-#define __SYSCALL_X32(nr, sym, compat) /* Not supported */
+#define __SYSCALL_64(nr, sym, qual) [nr] = 1,
  static char syscalls[] = {
  #include <asm/syscalls_64.h>
  };
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c

index d09e4c9d7cc5b4044c4421bde8692aaa6f8b21be..2c261082eadf82f693d062e7f600660fcd56817f 100644 (file)
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1654,7 +1654,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
         cpu_detect(&new_cpu_data);
         set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
         new_cpu_data.wp_works_ok = 1;
-       new_cpu_data.x86_capability[0] = cpuid_edx(1);
+       new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
  #endif
  
         if (xen_start_info->mod_start) {
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c

index 724a08740a04b4a255ca080c103ec3b7381bc4f7..9466354d3e4962f14cdae33b09378a73f9c51a5d 100644 (file)
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -11,7 +11,7 @@
  #include "pmu.h"
  
  /* x86_pmu.handle_irq definition */
-#include "../kernel/cpu/perf_event.h"
+#include "../events/perf_event.h"
  
  #define XENPMU_IRQ_PROCESSING    1
  struct xenpmu {
diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c

index 296b7a14893aabba895c578e8671e36b34875a37..b6f7fa3a1d403ea95428808f49edab475422e80a 100644 (file)
--- a/drivers/acpi/acpi_platform.c
+++ b/drivers/acpi/acpi_platform.c
@@ -62,7 +62,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev)
         if (count < 0) {
                 return NULL;
         } else if (count > 0) {
-               resources = kmalloc(count * sizeof(struct resource),
+               resources = kzalloc(count * sizeof(struct resource),
                                     GFP_KERNEL);
                 if (!resources) {
                         dev_err(&adev->dev, "No memory for resources\n");
diff --git a/drivers/acpi/acpica/psargs.c b/drivers/acpi/acpica/psargs.c

index 305218539df28cda17b97992ff44a0dbaeb613de..d48cbed342c1dda500d5020c746a9caf1c72ad09 100644 (file)
--- a/drivers/acpi/acpica/psargs.c
+++ b/drivers/acpi/acpica/psargs.c
@@ -269,8 +269,7 @@ acpi_ps_get_next_namepath(struct acpi_walk_state *walk_state,
          */
         if (ACPI_SUCCESS(status) &&
             possible_method_call && (node->type == ACPI_TYPE_METHOD)) {
-               if (GET_CURRENT_ARG_TYPE(walk_state->arg_types) ==
-                   ARGP_SUPERNAME) {
+               if (walk_state->opcode == AML_UNLOAD_OP) {
                         /*
                          * acpi_ps_get_next_namestring has increased the AML pointer,
                          * so we need to restore the saved AML pointer for method call.
@@ -697,7 +696,7 @@ static union acpi_parse_object *acpi_ps_get_next_field(struct acpi_parse_state
   *
   * PARAMETERS:  walk_state          - Current state
   *              parser_state        - Current parser state object
- *              arg_type            - The parser argument type (ARGP_*)
+ *              arg_type            - The argument type (AML_*_ARG)
   *              return_arg          - Where the next arg is returned
   *
   * RETURN:      Status, and an op object containing the next argument.
@@ -817,9 +816,9 @@ acpi_ps_get_next_arg(struct acpi_walk_state *walk_state,
                                 return_ACPI_STATUS(AE_NO_MEMORY);
                         }
  
-                       /* super_name allows argument to be a method call */
+                       /* To support super_name arg of Unload */
  
-                       if (arg_type == ARGP_SUPERNAME) {
+                       if (walk_state->opcode == AML_UNLOAD_OP) {
                                 status =
                                     acpi_ps_get_next_namepath(walk_state,
                                                               parser_state, arg,
diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c

index 0431883653bed9d3c0394368fe87ced99043f185..559c1173de1c99177ba702c60d27993826510e9c 100644 (file)
--- a/drivers/acpi/apei/einj.c
+++ b/drivers/acpi/apei/einj.c
@@ -519,7 +519,7 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
                              u64 param3, u64 param4)
  {
         int rc;
-       unsigned long pfn;
+       u64 base_addr, size;
  
         /* If user manually set "flags", make sure it is legal */
         if (flags && (flags &
@@ -545,10 +545,17 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
         /*
          * Disallow crazy address masks that give BIOS leeway to pick
          * injection address almost anywhere. Insist on page or
-        * better granularity and that target address is normal RAM.
+        * better granularity and that target address is normal RAM or
+        * NVDIMM.
          */
-       pfn = PFN_DOWN(param1 & param2);
-       if (!page_is_ram(pfn) || ((param2 & PAGE_MASK) != PAGE_MASK))
+       base_addr = param1 & param2;
+       size = ~param2 + 1;
+
+       if (((param2 & PAGE_MASK) != PAGE_MASK) ||
+           ((region_intersects(base_addr, size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)
+                               != REGION_INTERSECTS) &&
+            (region_intersects(base_addr, size, IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY)
+                               != REGION_INTERSECTS)))
                 return -EINVAL;
  
  inject:
diff --git a/drivers/base/property.c b/drivers/base/property.c

index c359351d50f1c99e9758b74ea84d97e76b402b0f..a163f2c59aa36571359e75661d52cfdc55d4ef3b 100644 (file)
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -218,7 +218,7 @@ bool fwnode_property_present(struct fwnode_handle *fwnode, const char *propname)
         bool ret;
  
         ret = __fwnode_property_present(fwnode, propname);
-       if (ret == false && fwnode && fwnode->secondary)
+       if (ret == false && fwnode && !IS_ERR_OR_NULL(fwnode->secondary))
                 ret = __fwnode_property_present(fwnode->secondary, propname);
         return ret;
  }
@@ -423,7 +423,7 @@ EXPORT_SYMBOL_GPL(device_property_match_string);
         int _ret_;                                                                      \
         _ret_ = FWNODE_PROP_READ(_fwnode_, _propname_, _type_, _proptype_,              \
                                  _val_, _nval_);                                        \
-       if (_ret_ == -EINVAL && _fwnode_ && _fwnode_->secondary)                        \
+       if (_ret_ == -EINVAL && _fwnode_ && !IS_ERR_OR_NULL(_fwnode_->secondary))       \
                 _ret_ = FWNODE_PROP_READ(_fwnode_->secondary, _propname_, _type_,       \
                                 _proptype_, _val_, _nval_);                             \
         _ret_;                                                                          \
@@ -593,7 +593,7 @@ int fwnode_property_read_string_array(struct fwnode_handle *fwnode,
         int ret;
  
         ret = __fwnode_property_read_string_array(fwnode, propname, val, nval);
-       if (ret == -EINVAL && fwnode && fwnode->secondary)
+       if (ret == -EINVAL && fwnode && !IS_ERR_OR_NULL(fwnode->secondary))
                 ret = __fwnode_property_read_string_array(fwnode->secondary,
                                                           propname, val, nval);
         return ret;
@@ -621,7 +621,7 @@ int fwnode_property_read_string(struct fwnode_handle *fwnode,
         int ret;
  
         ret = __fwnode_property_read_string(fwnode, propname, val);
-       if (ret == -EINVAL && fwnode && fwnode->secondary)
+       if (ret == -EINVAL && fwnode && !IS_ERR_OR_NULL(fwnode->secondary))
                 ret = __fwnode_property_read_string(fwnode->secondary,
                                                     propname, val);
         return ret;
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c

index cd83d477e32d412394da574e8e02adb6dd7be832..3a4b39afc0abb0cb9a7fc34845266176c95487a3 100644 (file)
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1431,7 +1431,7 @@ static int __init intel_pstate_init(void)
         if (!all_cpu_data)
                 return -ENOMEM;
  
-       if (static_cpu_has_safe(X86_FEATURE_HWP) && !no_hwp) {
+       if (static_cpu_has(X86_FEATURE_HWP) && !no_hwp) {
                 pr_info("intel_pstate: HWP enabled\n");
                 hwp_active++;
         }
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c

index 64f5d1bdbb48defdddc4c7ed7ee0c5b988b92e1b..8e304b1befc56385b8fc34d7697a529289d8ae3b 100644 (file)
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -176,6 +176,7 @@
  #define AT_XDMAC_MAX_CHAN      0x20
  #define AT_XDMAC_MAX_CSIZE     16      /* 16 data */
  #define AT_XDMAC_MAX_DWIDTH    8       /* 64 bits */
+#define AT_XDMAC_RESIDUE_MAX_RETRIES   5
  
  #define AT_XDMAC_DMA_BUSWIDTHS\
         (BIT(DMA_SLAVE_BUSWIDTH_UNDEFINED) |\
@@ -1395,8 +1396,8 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
         struct at_xdmac_desc    *desc, *_desc;
         struct list_head        *descs_list;
         enum dma_status         ret;
-       int                     residue;
-       u32                     cur_nda, mask, value;
+       int                     residue, retry;
+       u32                     cur_nda, check_nda, cur_ubc, mask, value;
         u8                      dwidth = 0;
         unsigned long           flags;
  
@@ -1433,7 +1434,42 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
                         cpu_relax();
         }
  
+       /*
+        * When processing the residue, we need to read two registers but we
+        * can't do it in an atomic way. AT_XDMAC_CNDA is used to find where
+        * we stand in the descriptor list and AT_XDMAC_CUBC is used
+        * to know how many data are remaining for the current descriptor.
+        * Since the dma channel is not paused to not loose data, between the
+        * AT_XDMAC_CNDA and AT_XDMAC_CUBC read, we may have change of
+        * descriptor.
+        * For that reason, after reading AT_XDMAC_CUBC, we check if we are
+        * still using the same descriptor by reading a second time
+        * AT_XDMAC_CNDA. If AT_XDMAC_CNDA has changed, it means we have to
+        * read again AT_XDMAC_CUBC.
+        * Memory barriers are used to ensure the read order of the registers.
+        * A max number of retries is set because unlikely it can never ends if
+        * we are transferring a lot of data with small buffers.
+        */
         cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
+       rmb();
+       cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
+       for (retry = 0; retry < AT_XDMAC_RESIDUE_MAX_RETRIES; retry++) {
+               rmb();
+               check_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
+
+               if (likely(cur_nda == check_nda))
+                       break;
+
+               cur_nda = check_nda;
+               rmb();
+               cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
+       }
+
+       if (unlikely(retry >= AT_XDMAC_RESIDUE_MAX_RETRIES)) {
+               ret = DMA_ERROR;
+               goto spin_unlock;
+       }
+
         /*
          * Remove size of all microblocks already transferred and the current
          * one. Then add the remaining size to transfer of the current
@@ -1446,7 +1482,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
                 if ((desc->lld.mbr_nda & 0xfffffffc) == cur_nda)
                         break;
         }
-       residue += at_xdmac_chan_read(atchan, AT_XDMAC_CUBC) << dwidth;
+       residue += cur_ubc << dwidth;
  
         dma_set_residue(txstate, residue);
  
diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c

index 2209f75fdf05bf29114f0fb3ffedff89f55b43f3..aac85c30c2cf6fc64669841c5b6abb5317df1b94 100644 (file)
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -522,6 +522,8 @@ static dma_cookie_t fsldma_run_tx_complete_actions(struct fsldma_chan *chan,
                         chan_dbg(chan, "LD %p callback\n", desc);
                         txd->callback(txd->callback_param);
                 }
+
+               dma_descriptor_unmap(txd);
         }
  
         /* Run any dependencies */
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c

index e4f43125e0fbbf5017cd848478eeb01550a87508..f039cfadf17b307c32bd88ef0e4a4a5ac749b590 100644 (file)
--- a/drivers/dma/iop-adma.c
+++ b/drivers/dma/iop-adma.c
@@ -1300,10 +1300,10 @@ static int iop_adma_probe(struct platform_device *pdev)
          * note: writecombine gives slightly better performance, but
          * requires that we explicitly flush the writes
          */
-       adev->dma_desc_pool_virt = dma_alloc_writecombine(&pdev->dev,
-                                                         plat_data->pool_size,
-                                                         &adev->dma_desc_pool,
-                                                         GFP_KERNEL);
+       adev->dma_desc_pool_virt = dma_alloc_wc(&pdev->dev,
+                                               plat_data->pool_size,
+                                               &adev->dma_desc_pool,
+                                               GFP_KERNEL);
         if (!adev->dma_desc_pool_virt) {
                 ret = -ENOMEM;
                 goto err_free_adev;
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c

index 14091f878f80a5ca7cd0843e70284d890ad49f0c..3922a5d5680604f831308dc02a50b93685a089ce 100644 (file)
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -964,8 +964,8 @@ mv_xor_channel_add(struct mv_xor_device *xordev,
          * requires that we explicitly flush the writes
          */
         mv_chan->dma_desc_pool_virt =
-         dma_alloc_writecombine(&pdev->dev, MV_XOR_POOL_SIZE,
-                                &mv_chan->dma_desc_pool, GFP_KERNEL);
+         dma_alloc_wc(&pdev->dev, MV_XOR_POOL_SIZE, &mv_chan->dma_desc_pool,
+                      GFP_KERNEL);
         if (!mv_chan->dma_desc_pool_virt)
                 return ERR_PTR(-ENOMEM);
  
diff --git a/drivers/dma/qcom_bam_dma.c b/drivers/dma/qcom_bam_dma.c

index 5a250cdc83769f0d4aaff65530c76b705a474e25..d34aef7a101b669994e2a92b88e448437f0bb6f8 100644 (file)
--- a/drivers/dma/qcom_bam_dma.c
+++ b/drivers/dma/qcom_bam_dma.c
@@ -502,8 +502,8 @@ static int bam_alloc_chan(struct dma_chan *chan)
                 return 0;
  
         /* allocate FIFO descriptor space, but only if necessary */
-       bchan->fifo_virt = dma_alloc_writecombine(bdev->dev, BAM_DESC_FIFO_SIZE,
-                               &bchan->fifo_phys, GFP_KERNEL);
+       bchan->fifo_virt = dma_alloc_wc(bdev->dev, BAM_DESC_FIFO_SIZE,
+                                       &bchan->fifo_phys, GFP_KERNEL);
  
         if (!bchan->fifo_virt) {
                 dev_err(bdev->dev, "Failed to allocate desc fifo\n");
@@ -538,8 +538,8 @@ static void bam_free_chan(struct dma_chan *chan)
         bam_reset_channel(bchan);
         spin_unlock_irqrestore(&bchan->vc.lock, flags);
  
-       dma_free_writecombine(bdev->dev, BAM_DESC_FIFO_SIZE, bchan->fifo_virt,
-                               bchan->fifo_phys);
+       dma_free_wc(bdev->dev, BAM_DESC_FIFO_SIZE, bchan->fifo_virt,
+                   bchan->fifo_phys);
         bchan->fifo_virt = NULL;
  
         /* mask irq for pipe/channel */
@@ -1231,9 +1231,9 @@ static int bam_dma_remove(struct platform_device *pdev)
                 bam_dma_terminate_all(&bdev->channels[i].vc.chan);
                 tasklet_kill(&bdev->channels[i].vc.task);
  
-               dma_free_writecombine(bdev->dev, BAM_DESC_FIFO_SIZE,
-                       bdev->channels[i].fifo_virt,
-                       bdev->channels[i].fifo_phys);
+               dma_free_wc(bdev->dev, BAM_DESC_FIFO_SIZE,
+                           bdev->channels[i].fifo_virt,
+                           bdev->channels[i].fifo_phys);
         }
  
         tasklet_kill(&bdev->task);
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c

index e3a945ce374b8318526102852b28535948284ae7..49768c08ac0734345007a1faaae7eef70cd34f60 100644 (file)
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -147,6 +147,135 @@ static const char * const mc6_mce_desc[] = {
         "Status Register File",
  };
  
+/* Scalable MCA error strings */
+static const char * const f17h_ls_mce_desc[] = {
+       "Load queue parity",
+       "Store queue parity",
+       "Miss address buffer payload parity",
+       "L1 TLB parity",
+       "",                                             /* reserved */
+       "DC tag error type 6",
+       "DC tag error type 1",
+       "Internal error type 1",
+       "Internal error type 2",
+       "Sys Read data error thread 0",
+       "Sys read data error thread 1",
+       "DC tag error type 2",
+       "DC data error type 1 (poison comsumption)",
+       "DC data error type 2",
+       "DC data error type 3",
+       "DC tag error type 4",
+       "L2 TLB parity",
+       "PDC parity error",
+       "DC tag error type 3",
+       "DC tag error type 5",
+       "L2 fill data error",
+};
+
+static const char * const f17h_if_mce_desc[] = {
+       "microtag probe port parity error",
+       "IC microtag or full tag multi-hit error",
+       "IC full tag parity",
+       "IC data array parity",
+       "Decoupling queue phys addr parity error",
+       "L0 ITLB parity error",
+       "L1 ITLB parity error",
+       "L2 ITLB parity error",
+       "BPQ snoop parity on Thread 0",
+       "BPQ snoop parity on Thread 1",
+       "L1 BTB multi-match error",
+       "L2 BTB multi-match error",
+};
+
+static const char * const f17h_l2_mce_desc[] = {
+       "L2M tag multi-way-hit error",
+       "L2M tag ECC error",
+       "L2M data ECC error",
+       "HW assert",
+};
+
+static const char * const f17h_de_mce_desc[] = {
+       "uop cache tag parity error",
+       "uop cache data parity error",
+       "Insn buffer parity error",
+       "Insn dispatch queue parity error",
+       "Fetch address FIFO parity",
+       "Patch RAM data parity",
+       "Patch RAM sequencer parity",
+       "uop buffer parity"
+};
+
+static const char * const f17h_ex_mce_desc[] = {
+       "Watchdog timeout error",
+       "Phy register file parity",
+       "Flag register file parity",
+       "Immediate displacement register file parity",
+       "Address generator payload parity",
+       "EX payload parity",
+       "Checkpoint queue parity",
+       "Retire dispatch queue parity",
+};
+
+static const char * const f17h_fp_mce_desc[] = {
+       "Physical register file parity",
+       "Freelist parity error",
+       "Schedule queue parity",
+       "NSQ parity error",
+       "Retire queue parity",
+       "Status register file parity",
+};
+
+static const char * const f17h_l3_mce_desc[] = {
+       "Shadow tag macro ECC error",
+       "Shadow tag macro multi-way-hit error",
+       "L3M tag ECC error",
+       "L3M tag multi-way-hit error",
+       "L3M data ECC error",
+       "XI parity, L3 fill done channel error",
+       "L3 victim queue parity",
+       "L3 HW assert",
+};
+
+static const char * const f17h_cs_mce_desc[] = {
+       "Illegal request from transport layer",
+       "Address violation",
+       "Security violation",
+       "Illegal response from transport layer",
+       "Unexpected response",
+       "Parity error on incoming request or probe response data",
+       "Parity error on incoming read response data",
+       "Atomic request parity",
+       "ECC error on probe filter access",
+};
+
+static const char * const f17h_pie_mce_desc[] = {
+       "HW assert",
+       "Internal PIE register security violation",
+       "Error on GMI link",
+       "Poison data written to internal PIE register",
+};
+
+static const char * const f17h_umc_mce_desc[] = {
+       "DRAM ECC error",
+       "Data poison error on DRAM",
+       "SDP parity error",
+       "Advanced peripheral bus error",
+       "Command/address parity error",
+       "Write data CRC error",
+};
+
+static const char * const f17h_pb_mce_desc[] = {
+       "Parameter Block RAM ECC error",
+};
+
+static const char * const f17h_psp_mce_desc[] = {
+       "PSP RAM ECC or parity error",
+};
+
+static const char * const f17h_smu_mce_desc[] = {
+       "SMU RAM ECC or parity error",
+};
+
  static bool f12h_mc0_mce(u16 ec, u8 xec)
  {
         bool ret = false;
@@ -691,6 +820,177 @@ static void decode_mc6_mce(struct mce *m)
         pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
  }
  
+static void decode_f17h_core_errors(const char *ip_name, u8 xec,
+                                  unsigned int mca_type)
+{
+       const char * const *error_desc_array;
+       size_t len;
+
+       pr_emerg(HW_ERR "%s Error: ", ip_name);
+
+       switch (mca_type) {
+       case SMCA_LS:
+               error_desc_array = f17h_ls_mce_desc;
+               len = ARRAY_SIZE(f17h_ls_mce_desc) - 1;
+
+               if (xec == 0x4) {
+                       pr_cont("Unrecognized LS MCA error code.\n");
+                       return;
+               }
+               break;
+
+       case SMCA_IF:
+               error_desc_array = f17h_if_mce_desc;
+               len = ARRAY_SIZE(f17h_if_mce_desc) - 1;
+               break;
+
+       case SMCA_L2_CACHE:
+               error_desc_array = f17h_l2_mce_desc;
+               len = ARRAY_SIZE(f17h_l2_mce_desc) - 1;
+               break;
+
+       case SMCA_DE:
+               error_desc_array = f17h_de_mce_desc;
+               len = ARRAY_SIZE(f17h_de_mce_desc) - 1;
+               break;
+
+       case SMCA_EX:
+               error_desc_array = f17h_ex_mce_desc;
+               len = ARRAY_SIZE(f17h_ex_mce_desc) - 1;
+               break;
+
+       case SMCA_FP:
+               error_desc_array = f17h_fp_mce_desc;
+               len = ARRAY_SIZE(f17h_fp_mce_desc) - 1;
+               break;
+
+       case SMCA_L3_CACHE:
+               error_desc_array = f17h_l3_mce_desc;
+               len = ARRAY_SIZE(f17h_l3_mce_desc) - 1;
+               break;
+
+       default:
+               pr_cont("Corrupted MCA core error info.\n");
+               return;
+       }
+
+       if (xec > len) {
+               pr_cont("Unrecognized %s MCA bank error code.\n",
+                        amd_core_mcablock_names[mca_type]);
+               return;
+       }
+
+       pr_cont("%s.\n", error_desc_array[xec]);
+}
+
+static void decode_df_errors(u8 xec, unsigned int mca_type)
+{
+       const char * const *error_desc_array;
+       size_t len;
+
+       pr_emerg(HW_ERR "Data Fabric Error: ");
+
+       switch (mca_type) {
+       case  SMCA_CS:
+               error_desc_array = f17h_cs_mce_desc;
+               len = ARRAY_SIZE(f17h_cs_mce_desc) - 1;
+               break;
+
+       case SMCA_PIE:
+               error_desc_array = f17h_pie_mce_desc;
+               len = ARRAY_SIZE(f17h_pie_mce_desc) - 1;
+               break;
+
+       default:
+               pr_cont("Corrupted MCA Data Fabric info.\n");
+               return;
+       }
+
+       if (xec > len) {
+               pr_cont("Unrecognized %s MCA bank error code.\n",
+                        amd_df_mcablock_names[mca_type]);
+               return;
+       }
+
+       pr_cont("%s.\n", error_desc_array[xec]);
+}
+
+/* Decode errors according to Scalable MCA specification */
+static void decode_smca_errors(struct mce *m)
+{
+       u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank);
+       unsigned int hwid, mca_type, i;
+       u8 xec = XEC(m->status, xec_mask);
+       const char * const *error_desc_array;
+       const char *ip_name;
+       u32 low, high;
+       size_t len;
+
+       if (rdmsr_safe(addr, &low, &high)) {
+               pr_emerg("Invalid IP block specified, error information is unreliable.\n");
+               return;
+       }
+
+       hwid = high & MCI_IPID_HWID;
+       mca_type = (high & MCI_IPID_MCATYPE) >> 16;
+
+       pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low);
+
+       /*
+        * Based on hwid and mca_type values, decode errors from respective IPs.
+        * Note: mca_type values make sense only in the context of an hwid.
+        */
+       for (i = 0; i < ARRAY_SIZE(amd_hwids); i++)
+               if (amd_hwids[i].hwid == hwid)
+                       break;
+
+       switch (i) {
+       case SMCA_F17H_CORE:
+               ip_name = (mca_type == SMCA_L3_CACHE) ?
+                         "L3 Cache" : "F17h Core";
+               return decode_f17h_core_errors(ip_name, xec, mca_type);
+               break;
+
+       case SMCA_DF:
+               return decode_df_errors(xec, mca_type);
+               break;
+
+       case SMCA_UMC:
+               error_desc_array = f17h_umc_mce_desc;
+               len = ARRAY_SIZE(f17h_umc_mce_desc) - 1;
+               break;
+
+       case SMCA_PB:
+               error_desc_array = f17h_pb_mce_desc;
+               len = ARRAY_SIZE(f17h_pb_mce_desc) - 1;
+               break;
+
+       case SMCA_PSP:
+               error_desc_array = f17h_psp_mce_desc;
+               len = ARRAY_SIZE(f17h_psp_mce_desc) - 1;
+               break;
+
+       case SMCA_SMU:
+               error_desc_array = f17h_smu_mce_desc;
+               len = ARRAY_SIZE(f17h_smu_mce_desc) - 1;
+               break;
+
+       default:
+               pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid);
+               return;
+       }
+
+       ip_name = amd_hwids[i].name;
+       pr_emerg(HW_ERR "%s Error: ", ip_name);
+
+       if (xec > len) {
+               pr_cont("Unrecognized %s MCA bank error code.\n", ip_name);
+               return;
+       }
+
+       pr_cont("%s.\n", error_desc_array[xec]);
+}
+
  static inline void amd_decode_err_code(u16 ec)
  {
         if (INT_ERROR(ec)) {
@@ -752,6 +1052,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
         struct mce *m = (struct mce *)data;
         struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
         int ecc;
+       u32 ebx = cpuid_ebx(0x80000007);
  
         if (amd_filter_mce(m))
                 return NOTIFY_STOP;
@@ -769,11 +1070,20 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
                 ((m->status & MCI_STATUS_PCC)   ? "PCC"   : "-"),
                 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
  
-       if (c->x86 == 0x15 || c->x86 == 0x16)
+       if (c->x86 >= 0x15)
                 pr_cont("|%s|%s",
                         ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
                         ((m->status & MCI_STATUS_POISON)   ? "Poison"   : "-"));
  
+       if (!!(ebx & BIT(3))) {
+               u32 low, high;
+               u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
+
+               if (!rdmsr_safe(addr, &low, &high) &&
+                   (low & MCI_CONFIG_MCAX))
+                       pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
+       }
+
         /* do the two bits[14:13] together */
         ecc = (m->status >> 45) & 0x3;
         if (ecc)
@@ -784,6 +1094,11 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
         if (m->status & MCI_STATUS_ADDRV)
                 pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr);
  
+       if (!!(ebx & BIT(3))) {
+               decode_smca_errors(m);
+               goto err_code;
+       }
+
         if (!fam_ops)
                 goto err_code;
  
@@ -834,6 +1149,7 @@ static struct notifier_block amd_mce_dec_nb = {
  static int __init mce_amd_init(void)
  {
         struct cpuinfo_x86 *c = &boot_cpu_data;
+       u32 ebx;
  
         if (c->x86_vendor != X86_VENDOR_AMD)
                 return -ENODEV;
@@ -888,10 +1204,18 @@ static int __init mce_amd_init(void)
                 fam_ops->mc2_mce = f16h_mc2_mce;
                 break;
  
+       case 0x17:
+               ebx = cpuid_ebx(0x80000007);
+               xec_mask = 0x3f;
+               if (!(ebx & BIT(3))) {
+                       printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
+                       goto err_out;
+               }
+               break;
+
         default:
                 printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
-               kfree(fam_ops);
-               fam_ops = NULL;
+               goto err_out;
         }
  
         pr_info("MCE: In-kernel MCE decoding enabled.\n");
@@ -899,6 +1223,11 @@ static int __init mce_amd_init(void)
         mce_register_decode_chain(&amd_mce_dec_nb);
  
         return 0;
+
+err_out:
+       kfree(fam_ops);
+       fam_ops = NULL;
+       return -EINVAL;
  }
  early_initcall(mce_amd_init);
  
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c

index e438ee5b433f3f498cba20df890730a3fc77220c..93f0d4120289fa92d01a6873fac6d2efd37337e7 100644 (file)
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -1574,7 +1574,7 @@ static int knl_get_dimm_capacity(struct sbridge_pvt *pvt, u64 *mc_sizes)
                                 for (cha = 0; cha < KNL_MAX_CHAS; cha++) {
                                         if (knl_get_mc_route(target,
                                                 mc_route_reg[cha]) == channel
-                                               && participants[channel]) {
+                                               && !participants[channel]) {
                                                 participant_count++;
                                                 participants[channel] = 1;
                                                 break;
@@ -1839,8 +1839,8 @@ static void get_memory_layout(const struct mem_ctl_info *mci)
                 edac_dbg(0, "TAD#%d: up to %u.%03u GB (0x%016Lx), socket interleave %d, memory interleave %d, TGT: %d, %d, %d, %d, reg=0x%08x\n",
                          n_tads, gb, (mb*1000)/1024,
                          ((u64)tmp_mb) << 20L,
-                        (u32)TAD_SOCK(reg),
-                        (u32)TAD_CH(reg),
+                        (u32)(1 << TAD_SOCK(reg)),
+                        (u32)TAD_CH(reg) + 1,
                          (u32)TAD_TGT0(reg),
                          (u32)TAD_TGT1(reg),
                          (u32)TAD_TGT2(reg),
@@ -2118,7 +2118,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
         }
  
         ch_way = TAD_CH(reg) + 1;
-       sck_way = TAD_SOCK(reg) + 1;
+       sck_way = 1 << TAD_SOCK(reg);
  
         if (ch_way == 3)
                 idx = addr >> 6;
@@ -2175,7 +2175,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
                  n_tads,
                  addr,
                  limit,
-                (u32)TAD_SOCK(reg),
+                sck_way,
                  ch_way,
                  offset,
                  idx,
@@ -2190,18 +2190,12 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
                         offset, addr);
                 return -EINVAL;
         }
-       addr -= offset;
-       /* Store the low bits [0:6] of the addr */
-       ch_addr = addr & 0x7f;
-       /* Remove socket wayness and remove 6 bits */
-       addr >>= 6;
-       addr = div_u64(addr, sck_xch);
-#if 0
-       /* Divide by channel way */
-       addr = addr / ch_way;
-#endif
-       /* Recover the last 6 bits */
-       ch_addr |= addr << 6;
+
+       ch_addr = addr - offset;
+       ch_addr >>= (6 + shiftup);
+       ch_addr /= ch_way * sck_way;
+       ch_addr <<= (6 + shiftup);
+       ch_addr |= addr & ((1 << (6 + shiftup)) - 1);
  
         /*
          * Step 3) Decode rank
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c

index 8297bc319369d6e4e4dd1579195ca1b6161a57d1..1846d65b72859284b31c536dc2fdc1a29cae0d9e 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
@@ -96,7 +96,7 @@ static void amdgpu_flip_work_func(struct work_struct *__work)
          * In practice this won't execute very often unless on very fast
          * machines because the time window for this to happen is very small.
          */
-       while (amdgpuCrtc->enabled && repcnt--) {
+       while (amdgpuCrtc->enabled && --repcnt) {
                 /* GET_DISTANCE_TO_VBLANKSTART returns distance to real vblank
                  * start in hpos, and to the "fudged earlier" vblank start in
                  * vpos.
@@ -112,13 +112,13 @@ static void amdgpu_flip_work_func(struct work_struct *__work)
                         break;
  
                 /* Sleep at least until estimated real start of hw vblank */
-               spin_unlock_irqrestore(&crtc->dev->event_lock, flags);
                 min_udelay = (-hpos + 1) * max(vblank->linedur_ns / 1000, 5);
                 if (min_udelay > vblank->framedur_ns / 2000) {
                         /* Don't wait ridiculously long - something is wrong */
                         repcnt = 0;
                         break;
                 }
+               spin_unlock_irqrestore(&crtc->dev->event_lock, flags);
                 usleep_range(min_udelay, 2 * min_udelay);
                 spin_lock_irqsave(&crtc->dev->event_lock, flags);
         };
diff --git a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c

index 21aacc1f45c1fd7afded7f6088c8a365e9005224..bf731e9f643e9ecddd367034179fae6a40d294ce 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c
+++ b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c
@@ -265,15 +265,27 @@ static int amdgpu_atombios_dp_get_dp_link_config(struct drm_connector *connector
         unsigned max_lane_num = drm_dp_max_lane_count(dpcd);
         unsigned lane_num, i, max_pix_clock;
  
-       for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
-               for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
-                       max_pix_clock = (lane_num * link_rates[i] * 8) / bpp;
+       if (amdgpu_connector_encoder_get_dp_bridge_encoder_id(connector) ==
+           ENCODER_OBJECT_ID_NUTMEG) {
+               for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
+                       max_pix_clock = (lane_num * 270000 * 8) / bpp;
                         if (max_pix_clock >= pix_clock) {
                                 *dp_lanes = lane_num;
-                               *dp_rate = link_rates[i];
+                               *dp_rate = 270000;
                                 return 0;
                         }
                 }
+       } else {
+               for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
+                       for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
+                               max_pix_clock = (lane_num * link_rates[i] * 8) / bpp;
+                               if (max_pix_clock >= pix_clock) {
+                                       *dp_lanes = lane_num;
+                                       *dp_rate = link_rates[i];
+                                       return 0;
+                               }
+                       }
+               }
         }
  
         return -EINVAL;
diff --git a/drivers/gpu/drm/drm_gem_cma_helper.c b/drivers/gpu/drm/drm_gem_cma_helper.c

index e5df53b6e229ff0348541eeaa921acc6cf4700ef..1f500a1b99695a6bb83b3d596b9a2101f6f1f8ef 100644 (file)
--- a/drivers/gpu/drm/drm_gem_cma_helper.c
+++ b/drivers/gpu/drm/drm_gem_cma_helper.c
@@ -109,8 +109,8 @@ struct drm_gem_cma_object *drm_gem_cma_create(struct drm_device *drm,
         if (IS_ERR(cma_obj))
                 return cma_obj;
  
-       cma_obj->vaddr = dma_alloc_writecombine(drm->dev, size,
-                       &cma_obj->paddr, GFP_KERNEL | __GFP_NOWARN);
+       cma_obj->vaddr = dma_alloc_wc(drm->dev, size, &cma_obj->paddr,
+                                     GFP_KERNEL | __GFP_NOWARN);
         if (!cma_obj->vaddr) {
                 dev_err(drm->dev, "failed to allocate buffer with size %zu\n",
                         size);
@@ -192,8 +192,8 @@ void drm_gem_cma_free_object(struct drm_gem_object *gem_obj)
         cma_obj = to_drm_gem_cma_obj(gem_obj);
  
         if (cma_obj->vaddr) {
-               dma_free_writecombine(gem_obj->dev->dev, cma_obj->base.size,
-                                     cma_obj->vaddr, cma_obj->paddr);
+               dma_free_wc(gem_obj->dev->dev, cma_obj->base.size,
+                           cma_obj->vaddr, cma_obj->paddr);
         } else if (gem_obj->import_attach) {
                 drm_prime_gem_destroy(gem_obj, cma_obj->sgt);
         }
@@ -324,9 +324,8 @@ static int drm_gem_cma_mmap_obj(struct drm_gem_cma_object *cma_obj,
         vma->vm_flags &= ~VM_PFNMAP;
         vma->vm_pgoff = 0;
  
-       ret = dma_mmap_writecombine(cma_obj->base.dev->dev, vma,
-                                   cma_obj->vaddr, cma_obj->paddr,
-                                   vma->vm_end - vma->vm_start);
+       ret = dma_mmap_wc(cma_obj->base.dev->dev, vma, cma_obj->vaddr,
+                         cma_obj->paddr, vma->vm_end - vma->vm_start);
         if (ret)
                 drm_gem_vm_close(vma);
  
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c

index a33162cf4f4cc6c4f0aee66433680a15f64cfffc..3c1ce44483d991416fed77f8945dc6686f3a3341 100644 (file)
--- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
@@ -1113,8 +1113,8 @@ struct etnaviv_cmdbuf *etnaviv_gpu_cmdbuf_new(struct etnaviv_gpu *gpu, u32 size,
         if (!cmdbuf)
                 return NULL;
  
-       cmdbuf->vaddr = dma_alloc_writecombine(gpu->dev, size, &cmdbuf->paddr,
-                                              GFP_KERNEL);
+       cmdbuf->vaddr = dma_alloc_wc(gpu->dev, size, &cmdbuf->paddr,
+                                    GFP_KERNEL);
         if (!cmdbuf->vaddr) {
                 kfree(cmdbuf);
                 return NULL;
@@ -1128,8 +1128,8 @@ struct etnaviv_cmdbuf *etnaviv_gpu_cmdbuf_new(struct etnaviv_gpu *gpu, u32 size,
  
  void etnaviv_gpu_cmdbuf_free(struct etnaviv_cmdbuf *cmdbuf)
  {
-       dma_free_writecombine(cmdbuf->gpu->dev, cmdbuf->size,
-                             cmdbuf->vaddr, cmdbuf->paddr);
+       dma_free_wc(cmdbuf->gpu->dev, cmdbuf->size, cmdbuf->vaddr,
+                   cmdbuf->paddr);
         kfree(cmdbuf);
  }
  
diff --git a/drivers/gpu/drm/i2c/tda998x_drv.c b/drivers/gpu/drm/i2c/tda998x_drv.c

index 34e38749a8176f87001e54cb4ea8a7cc86fa744e..f8ee740c0e264d488b937503745ac9de06bbc9b1 100644 (file)
--- a/drivers/gpu/drm/i2c/tda998x_drv.c
+++ b/drivers/gpu/drm/i2c/tda998x_drv.c
@@ -1382,8 +1382,16 @@ static void tda998x_connector_destroy(struct drm_connector *connector)
         drm_connector_cleanup(connector);
  }
  
+static int tda998x_connector_dpms(struct drm_connector *connector, int mode)
+{
+       if (drm_core_check_feature(connector->dev, DRIVER_ATOMIC))
+               return drm_atomic_helper_connector_dpms(connector, mode);
+       else
+               return drm_helper_connector_dpms(connector, mode);
+}
+
  static const struct drm_connector_funcs tda998x_connector_funcs = {
-       .dpms = drm_atomic_helper_connector_dpms,
+       .dpms = tda998x_connector_dpms,
         .reset = drm_atomic_helper_connector_reset,
         .fill_modes = drm_helper_probe_single_connector_modes,
         .detect = tda998x_connector_detect,
diff --git a/drivers/gpu/drm/i915/intel_audio.c b/drivers/gpu/drm/i915/intel_audio.c

index 31f6d212fb1bbbfa1ce64efe8cebde2a5f9f575d..30f921421b0c944217832ba86856a6904f8fef11 100644 (file)
--- a/drivers/gpu/drm/i915/intel_audio.c
+++ b/drivers/gpu/drm/i915/intel_audio.c
@@ -527,6 +527,8 @@ void intel_audio_codec_enable(struct intel_encoder *intel_encoder)
  
         mutex_lock(&dev_priv->av_mutex);
         intel_dig_port->audio_connector = connector;
+       /* referred in audio callbacks */
+       dev_priv->dig_port_map[port] = intel_encoder;
         mutex_unlock(&dev_priv->av_mutex);
  
         if (acomp && acomp->audio_ops && acomp->audio_ops->pin_eld_notify)
@@ -554,6 +556,7 @@ void intel_audio_codec_disable(struct intel_encoder *intel_encoder)
  
         mutex_lock(&dev_priv->av_mutex);
         intel_dig_port->audio_connector = NULL;
+       dev_priv->dig_port_map[port] = NULL;
         mutex_unlock(&dev_priv->av_mutex);
  
         if (acomp && acomp->audio_ops && acomp->audio_ops->pin_eld_notify)
diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c

index 0f3df2c39f7cdd3a69d8f0044cbd725e3b9bac6d..084d5586585d012891423067703fde0f9daec0b1 100644 (file)
--- a/drivers/gpu/drm/i915/intel_ddi.c
+++ b/drivers/gpu/drm/i915/intel_ddi.c
@@ -3358,7 +3358,6 @@ void intel_ddi_init(struct drm_device *dev, enum port port)
         intel_encoder->get_config = intel_ddi_get_config;
  
         intel_dig_port->port = port;
-       dev_priv->dig_port_map[port] = intel_encoder;
         intel_dig_port->saved_port_bits = I915_READ(DDI_BUF_CTL(port)) &
                                           (DDI_BUF_PORT_REVERSAL |
                                            DDI_A_4_LANES);
diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c

index 1d8de43bed56839fd1fee3a75b6d61b6221c7c0a..cdc2c15873dcc71189ed3263aad55e3c63493ed3 100644 (file)
--- a/drivers/gpu/drm/i915/intel_dp.c
+++ b/drivers/gpu/drm/i915/intel_dp.c
@@ -6045,7 +6045,6 @@ intel_dp_init(struct drm_device *dev,
         }
  
         intel_dig_port->port = port;
-       dev_priv->dig_port_map[port] = intel_encoder;
         intel_dig_port->dp.output_reg = output_reg;
  
         intel_encoder->type = INTEL_OUTPUT_DISPLAYPORT;
diff --git a/drivers/gpu/drm/i915/intel_hdmi.c b/drivers/gpu/drm/i915/intel_hdmi.c

index cb5d1b15755c3b19a496105c7a8df99982e83cb9..616108c4bc3e5741f59c9a1066aa2df3b63d5fc5 100644 (file)
--- a/drivers/gpu/drm/i915/intel_hdmi.c
+++ b/drivers/gpu/drm/i915/intel_hdmi.c
@@ -2154,7 +2154,6 @@ void intel_hdmi_init_connector(struct intel_digital_port *intel_dig_port,
  void intel_hdmi_init(struct drm_device *dev,
                      i915_reg_t hdmi_reg, enum port port)
  {
-       struct drm_i915_private *dev_priv = dev->dev_private;
         struct intel_digital_port *intel_dig_port;
         struct intel_encoder *intel_encoder;
         struct intel_connector *intel_connector;
@@ -2223,7 +2222,6 @@ void intel_hdmi_init(struct drm_device *dev,
                 intel_encoder->cloneable |= 1 << INTEL_OUTPUT_HDMI;
  
         intel_dig_port->port = port;
-       dev_priv->dig_port_map[port] = intel_encoder;
         intel_dig_port->hdmi.hdmi_reg = hdmi_reg;
         intel_dig_port->dp.output_reg = INVALID_MMIO_REG;
  
diff --git a/drivers/gpu/drm/i915/intel_i2c.c b/drivers/gpu/drm/i915/intel_i2c.c

index deb8282c26d83f952473ae145c4fef0b3112b9f1..52fbe530fc9eac207093271a05d44dbf3515e11c 100644 (file)
--- a/drivers/gpu/drm/i915/intel_i2c.c
+++ b/drivers/gpu/drm/i915/intel_i2c.c
@@ -664,6 +664,12 @@ int intel_setup_gmbus(struct drm_device *dev)
  
                 bus->adapter.algo = &gmbus_algorithm;
  
+               /*
+                * We wish to retry with bit banging
+                * after a timed out GMBUS attempt.
+                */
+               bus->adapter.retries = 1;
+
                 /* By default use a conservative clock rate */
                 bus->reg0 = pin | GMBUS_RATE_100KHZ;
  
diff --git a/drivers/gpu/drm/imx/ipuv3-crtc.c b/drivers/gpu/drm/imx/ipuv3-crtc.c

index 30a57185bdb4e41e7ff26a0d4af36577a218b0ea..287226311413c7036cecb02e973e6109d0b26548 100644 (file)
--- a/drivers/gpu/drm/imx/ipuv3-crtc.c
+++ b/drivers/gpu/drm/imx/ipuv3-crtc.c
@@ -64,6 +64,7 @@ static void ipu_fb_enable(struct ipu_crtc *ipu_crtc)
         /* Start DC channel and DI after IDMAC */
         ipu_dc_enable_channel(ipu_crtc->dc);
         ipu_di_enable(ipu_crtc->di);
+       drm_crtc_vblank_on(&ipu_crtc->base);
  
         ipu_crtc->enabled = 1;
  }
@@ -80,6 +81,7 @@ static void ipu_fb_disable(struct ipu_crtc *ipu_crtc)
         ipu_di_disable(ipu_crtc->di);
         ipu_plane_disable(ipu_crtc->plane[0]);
         ipu_dc_disable(ipu);
+       drm_crtc_vblank_off(&ipu_crtc->base);
  
         ipu_crtc->enabled = 0;
  }
diff --git a/drivers/gpu/drm/imx/ipuv3-plane.c b/drivers/gpu/drm/imx/ipuv3-plane.c

index 591ba2f1ae03674224b4719660d547ad4e64c0b1..26bb1b626fe3d817812e18e19e0bfb64a4247c3c 100644 (file)
--- a/drivers/gpu/drm/imx/ipuv3-plane.c
+++ b/drivers/gpu/drm/imx/ipuv3-plane.c
@@ -42,6 +42,7 @@ static const uint32_t ipu_plane_formats[] = {
         DRM_FORMAT_YVYU,
         DRM_FORMAT_YUV420,
         DRM_FORMAT_YVU420,
+       DRM_FORMAT_RGB565,
  };
  
  int ipu_plane_irq(struct ipu_plane *ipu_plane)
diff --git a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c

index dfebdc4aa0f24ac00f47ab3062d1e24e9b455648..85dfe3674b4138ef02e1b3f043699459d9decb25 100644 (file)
--- a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
+++ b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
@@ -573,10 +573,9 @@ static int omap_dmm_remove(struct platform_device *dev)
  
                 kfree(omap_dmm->engines);
                 if (omap_dmm->refill_va)
-                       dma_free_writecombine(omap_dmm->dev,
-                               REFILL_BUFFER_SIZE * omap_dmm->num_engines,
-                               omap_dmm->refill_va,
-                               omap_dmm->refill_pa);
+                       dma_free_wc(omap_dmm->dev,
+                                   REFILL_BUFFER_SIZE * omap_dmm->num_engines,
+                                   omap_dmm->refill_va, omap_dmm->refill_pa);
                 if (omap_dmm->dummy_page)
                         __free_page(omap_dmm->dummy_page);
  
@@ -701,9 +700,9 @@ static int omap_dmm_probe(struct platform_device *dev)
         omap_dmm->dummy_pa = page_to_phys(omap_dmm->dummy_page);
  
         /* alloc refill memory */
-       omap_dmm->refill_va = dma_alloc_writecombine(&dev->dev,
-                               REFILL_BUFFER_SIZE * omap_dmm->num_engines,
-                               &omap_dmm->refill_pa, GFP_KERNEL);
+       omap_dmm->refill_va = dma_alloc_wc(&dev->dev,
+                                          REFILL_BUFFER_SIZE * omap_dmm->num_engines,
+                                          &omap_dmm->refill_pa, GFP_KERNEL);
         if (!omap_dmm->refill_va) {
                 dev_err(&dev->dev, "could not allocate refill memory\n");
                 goto fail;
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c

index 8495a1a4b61745524dd04045616d859b2ac566bb..359b0d7e8ef78faf6cd9d6e044cfb5088f99f0be 100644 (file)
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -1330,8 +1330,8 @@ void omap_gem_free_object(struct drm_gem_object *obj)
                         omap_gem_detach_pages(obj);
  
                 if (!is_shmem(obj)) {
-                       dma_free_writecombine(dev->dev, obj->size,
-                                       omap_obj->vaddr, omap_obj->paddr);
+                       dma_free_wc(dev->dev, obj->size, omap_obj->vaddr,
+                                   omap_obj->paddr);
                 } else if (omap_obj->vaddr) {
                         vunmap(omap_obj->vaddr);
                 }
@@ -1395,8 +1395,8 @@ struct drm_gem_object *omap_gem_new(struct drm_device *dev,
                 /* attempt to allocate contiguous memory if we don't
                  * have DMM for remappign discontiguous buffers
                  */
-               omap_obj->vaddr =  dma_alloc_writecombine(dev->dev, size,
-                               &omap_obj->paddr, GFP_KERNEL);
+               omap_obj->vaddr =  dma_alloc_wc(dev->dev, size,
+                                               &omap_obj->paddr, GFP_KERNEL);
                 if (!omap_obj->vaddr) {
                         kfree(omap_obj);
  
diff --git a/drivers/gpu/drm/radeon/atombios_dp.c b/drivers/gpu/drm/radeon/atombios_dp.c

index 44ee72e04df9e953bafe64cdfdadf2f01c1f9bce..6af832545bc5b76b44e836553e72005f01012bbe 100644 (file)
--- a/drivers/gpu/drm/radeon/atombios_dp.c
+++ b/drivers/gpu/drm/radeon/atombios_dp.c
@@ -315,15 +315,27 @@ int radeon_dp_get_dp_link_config(struct drm_connector *connector,
         unsigned max_lane_num = drm_dp_max_lane_count(dpcd);
         unsigned lane_num, i, max_pix_clock;
  
-       for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
-               for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
-                       max_pix_clock = (lane_num * link_rates[i] * 8) / bpp;
+       if (radeon_connector_encoder_get_dp_bridge_encoder_id(connector) ==
+           ENCODER_OBJECT_ID_NUTMEG) {
+               for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
+                       max_pix_clock = (lane_num * 270000 * 8) / bpp;
                         if (max_pix_clock >= pix_clock) {
                                 *dp_lanes = lane_num;
-                               *dp_rate = link_rates[i];
+                               *dp_rate = 270000;
                                 return 0;
                         }
                 }
+       } else {
+               for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
+                       for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
+                               max_pix_clock = (lane_num * link_rates[i] * 8) / bpp;
+                               if (max_pix_clock >= pix_clock) {
+                                       *dp_lanes = lane_num;
+                                       *dp_rate = link_rates[i];
+                                       return 0;
+                               }
+                       }
+               }
         }
  
         return -EINVAL;
diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c

index 2b9ba03a7c1a84bc00564fe0f6391771977c2e7b..2d9196a447fdc94a49140dcad365b6d3957d43f6 100644 (file)
--- a/drivers/gpu/drm/radeon/radeon_display.c
+++ b/drivers/gpu/drm/radeon/radeon_display.c
@@ -455,7 +455,7 @@ static void radeon_flip_work_func(struct work_struct *__work)
          * In practice this won't execute very often unless on very fast
          * machines because the time window for this to happen is very small.
          */
-       while (radeon_crtc->enabled && repcnt--) {
+       while (radeon_crtc->enabled && --repcnt) {
                 /* GET_DISTANCE_TO_VBLANKSTART returns distance to real vblank
                  * start in hpos, and to the "fudged earlier" vblank start in
                  * vpos.
@@ -471,13 +471,13 @@ static void radeon_flip_work_func(struct work_struct *__work)
                         break;
  
                 /* Sleep at least until estimated real start of hw vblank */
-               spin_unlock_irqrestore(&crtc->dev->event_lock, flags);
                 min_udelay = (-hpos + 1) * max(vblank->linedur_ns / 1000, 5);
                 if (min_udelay > vblank->framedur_ns / 2000) {
                         /* Don't wait ridiculously long - something is wrong */
                         repcnt = 0;
                         break;
                 }
+               spin_unlock_irqrestore(&crtc->dev->event_lock, flags);
                 usleep_range(min_udelay, 2 * min_udelay);
                 spin_lock_irqsave(&crtc->dev->event_lock, flags);
         };
diff --git a/drivers/gpu/drm/radeon/radeon_pm.c b/drivers/gpu/drm/radeon/radeon_pm.c

index 0f14d897baf9b32974a97eaa59153363e3d8c97b..7a98823bacd1cd5da33c9e11dc7f798ae1a866d9 100644 (file)
--- a/drivers/gpu/drm/radeon/radeon_pm.c
+++ b/drivers/gpu/drm/radeon/radeon_pm.c
@@ -1079,6 +1079,8 @@ force:
  
         /* update display watermarks based on new power state */
         radeon_bandwidth_update(rdev);
+       /* update displays */
+       radeon_dpm_display_configuration_changed(rdev);
  
         /* wait for the rings to drain */
         for (i = 0; i < RADEON_NUM_RINGS; i++) {
@@ -1095,9 +1097,6 @@ force:
  
         radeon_dpm_post_set_power_state(rdev);
  
-       /* update displays */
-       radeon_dpm_display_configuration_changed(rdev);
-
         rdev->pm.dpm.current_active_crtcs = rdev->pm.dpm.new_active_crtcs;
         rdev->pm.dpm.current_active_crtc_count = rdev->pm.dpm.new_active_crtc_count;
         rdev->pm.dpm.single_display = single_display;
diff --git a/drivers/gpu/drm/sti/sti_cursor.c b/drivers/gpu/drm/sti/sti_cursor.c

index 807863106b8da813949383f291830c60d087d4e0..bd736ace3f8147b25c3362cb2e46aeeef2ef87b8 100644 (file)
--- a/drivers/gpu/drm/sti/sti_cursor.c
+++ b/drivers/gpu/drm/sti/sti_cursor.c
@@ -157,17 +157,15 @@ static void sti_cursor_atomic_update(struct drm_plane *drm_plane,
                 cursor->height = src_h;
  
                 if (cursor->pixmap.base)
-                       dma_free_writecombine(cursor->dev,
-                                             cursor->pixmap.size,
-                                             cursor->pixmap.base,
-                                             cursor->pixmap.paddr);
+                       dma_free_wc(cursor->dev, cursor->pixmap.size,
+                                   cursor->pixmap.base, cursor->pixmap.paddr);
  
                 cursor->pixmap.size = cursor->width * cursor->height;
  
-               cursor->pixmap.base = dma_alloc_writecombine(cursor->dev,
-                                                       cursor->pixmap.size,
-                                                       &cursor->pixmap.paddr,
-                                                       GFP_KERNEL | GFP_DMA);
+               cursor->pixmap.base = dma_alloc_wc(cursor->dev,
+                                                  cursor->pixmap.size,
+                                                  &cursor->pixmap.paddr,
+                                                  GFP_KERNEL | GFP_DMA);
                 if (!cursor->pixmap.base) {
                         DRM_ERROR("Failed to allocate memory for pixmap\n");
                         return;
@@ -252,8 +250,8 @@ struct drm_plane *sti_cursor_create(struct drm_device *drm_dev,
  
         /* Allocate clut buffer */
         size = 0x100 * sizeof(unsigned short);
-       cursor->clut = dma_alloc_writecombine(dev, size, &cursor->clut_paddr,
-                                             GFP_KERNEL | GFP_DMA);
+       cursor->clut = dma_alloc_wc(dev, size, &cursor->clut_paddr,
+                                   GFP_KERNEL | GFP_DMA);
  
         if (!cursor->clut) {
                 DRM_ERROR("Failed to allocate memory for cursor clut\n");
@@ -286,7 +284,7 @@ struct drm_plane *sti_cursor_create(struct drm_device *drm_dev,
         return &cursor->plane.drm_plane;
  
  err_plane:
-       dma_free_writecombine(dev, size, cursor->clut, cursor->clut_paddr);
+       dma_free_wc(dev, size, cursor->clut, cursor->clut_paddr);
  err_clut:
         devm_kfree(dev, cursor);
         return NULL;
diff --git a/drivers/gpu/drm/sti/sti_gdp.c b/drivers/gpu/drm/sti/sti_gdp.c

index f9a1d92c9d9519c3ddbeefe33277f26af4f37a32..514551c857bbda87b8fcbbca215bdc17c7bdfe67 100644 (file)
--- a/drivers/gpu/drm/sti/sti_gdp.c
+++ b/drivers/gpu/drm/sti/sti_gdp.c
@@ -312,8 +312,7 @@ static void sti_gdp_init(struct sti_gdp *gdp)
         /* Allocate all the nodes within a single memory page */
         size = sizeof(struct sti_gdp_node) *
             GDP_NODE_PER_FIELD * GDP_NODE_NB_BANK;
-       base = dma_alloc_writecombine(gdp->dev,
-                                     size, &dma_addr, GFP_KERNEL | GFP_DMA);
+       base = dma_alloc_wc(gdp->dev, size, &dma_addr, GFP_KERNEL | GFP_DMA);
  
         if (!base) {
                 DRM_ERROR("Failed to allocate memory for GDP node\n");
diff --git a/drivers/gpu/drm/sti/sti_hqvdp.c b/drivers/gpu/drm/sti/sti_hqvdp.c

index 43861b52261d7e4fee17839351577e5d4b90080f..1d3c3d029603d030ec02809fd14026d6d5df08da 100644 (file)
--- a/drivers/gpu/drm/sti/sti_hqvdp.c
+++ b/drivers/gpu/drm/sti/sti_hqvdp.c
@@ -617,9 +617,9 @@ static void sti_hqvdp_init(struct sti_hqvdp *hqvdp)
  
         /* Allocate memory for the VDP commands */
         size = NB_VDP_CMD * sizeof(struct sti_hqvdp_cmd);
-       hqvdp->hqvdp_cmd = dma_alloc_writecombine(hqvdp->dev, size,
-                                        &hqvdp->hqvdp_cmd_paddr,
-                                        GFP_KERNEL | GFP_DMA);
+       hqvdp->hqvdp_cmd = dma_alloc_wc(hqvdp->dev, size,
+                                       &hqvdp->hqvdp_cmd_paddr,
+                                       GFP_KERNEL | GFP_DMA);
         if (!hqvdp->hqvdp_cmd) {
                 DRM_ERROR("Failed to allocate memory for VDP cmd\n");
                 return;
diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c

index 33add93b4ed96826a3f7da9bfc5d11d0018eff8b..3b0d8c392b707e93e56a0b391345d58d3a5eef4c 100644 (file)
--- a/drivers/gpu/drm/tegra/gem.c
+++ b/drivers/gpu/drm/tegra/gem.c
@@ -175,8 +175,7 @@ static void tegra_bo_free(struct drm_device *drm, struct tegra_bo *bo)
                 sg_free_table(bo->sgt);
                 kfree(bo->sgt);
         } else if (bo->vaddr) {
-               dma_free_writecombine(drm->dev, bo->gem.size, bo->vaddr,
-                                     bo->paddr);
+               dma_free_wc(drm->dev, bo->gem.size, bo->vaddr, bo->paddr);
         }
  }
  
@@ -233,8 +232,8 @@ static int tegra_bo_alloc(struct drm_device *drm, struct tegra_bo *bo)
         } else {
                 size_t size = bo->gem.size;
  
-               bo->vaddr = dma_alloc_writecombine(drm->dev, size, &bo->paddr,
-                                                  GFP_KERNEL | __GFP_NOWARN);
+               bo->vaddr = dma_alloc_wc(drm->dev, size, &bo->paddr,
+                                        GFP_KERNEL | __GFP_NOWARN);
                 if (!bo->vaddr) {
                         dev_err(drm->dev,
                                 "failed to allocate buffer of size %zu\n",
@@ -472,8 +471,8 @@ int tegra_drm_mmap(struct file *file, struct vm_area_struct *vma)
                 vma->vm_flags &= ~VM_PFNMAP;
                 vma->vm_pgoff = 0;
  
-               ret = dma_mmap_writecombine(gem->dev->dev, vma, bo->vaddr,
-                                           bo->paddr, gem->size);
+               ret = dma_mmap_wc(gem->dev->dev, vma, bo->vaddr, bo->paddr,
+                                 gem->size);
                 if (ret) {
                         drm_gem_vm_close(vma);
                         return ret;
diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c

index 22278bcfc60eac4ed40fac0e31dda840aa4b703e..034ef2de903769f521c50c6a6936a5099e69578f 100644 (file)
--- a/drivers/gpu/drm/vc4/vc4_bo.c
+++ b/drivers/gpu/drm/vc4/vc4_bo.c
@@ -398,9 +398,8 @@ int vc4_mmap(struct file *filp, struct vm_area_struct *vma)
         vma->vm_flags &= ~VM_PFNMAP;
         vma->vm_pgoff = 0;
  
-       ret = dma_mmap_writecombine(bo->base.base.dev->dev, vma,
-                                   bo->base.vaddr, bo->base.paddr,
-                                   vma->vm_end - vma->vm_start);
+       ret = dma_mmap_wc(bo->base.base.dev->dev, vma, bo->base.vaddr,
+                         bo->base.paddr, vma->vm_end - vma->vm_start);
         if (ret)
                 drm_gem_vm_close(vma);
  
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c b/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c

index db082bea8dafd023fa6ad19719f7fadf5ebfcad5..c5a1a08b0449004670b79947e68f0ba7fb8afb50 100644 (file)
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c
@@ -563,6 +563,8 @@ static void vmw_sou_connector_destroy(struct drm_connector *connector)
  
  static const struct drm_connector_funcs vmw_sou_connector_funcs = {
         .dpms = vmw_du_connector_dpms,
+       .detect = vmw_du_connector_detect,
+       .fill_modes = vmw_du_connector_fill_modes,
         .set_property = vmw_du_connector_set_property,
         .destroy = vmw_sou_connector_destroy,
  };
diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c

index 5a8c8d55317ab4f622c5754f509d5d3660a3e47c..a18db4d5347cefa99847df944be3a3b364de089a 100644 (file)
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -52,8 +52,8 @@ static void host1x_pushbuffer_destroy(struct push_buffer *pb)
         struct host1x *host1x = cdma_to_host1x(cdma);
  
         if (pb->phys != 0)
-               dma_free_writecombine(host1x->dev, pb->size_bytes + 4,
-                                     pb->mapped, pb->phys);
+               dma_free_wc(host1x->dev, pb->size_bytes + 4, pb->mapped,
+                           pb->phys);
  
         pb->mapped = NULL;
         pb->phys = 0;
@@ -76,8 +76,8 @@ static int host1x_pushbuffer_init(struct push_buffer *pb)
         pb->pos = 0;
  
         /* allocate and map pushbuffer memory */
-       pb->mapped = dma_alloc_writecombine(host1x->dev, pb->size_bytes + 4,
-                                           &pb->phys, GFP_KERNEL);
+       pb->mapped = dma_alloc_wc(host1x->dev, pb->size_bytes + 4, &pb->phys,
+                                 GFP_KERNEL);
         if (!pb->mapped)
                 goto fail;
  
diff --git a/drivers/gpu/host1x/job.c b/drivers/gpu/host1x/job.c

index 63bd63f3c7dfd2da2fd3d9ae58bba1473bceb70b..defa7995f2131a06ddc88e09d13d914f8a244ea1 100644 (file)
--- a/drivers/gpu/host1x/job.c
+++ b/drivers/gpu/host1x/job.c
@@ -467,9 +467,8 @@ static inline int copy_gathers(struct host1x_job *job, struct device *dev)
                 size += g->words * sizeof(u32);
         }
  
-       job->gather_copy_mapped = dma_alloc_writecombine(dev, size,
-                                                        &job->gather_copy,
-                                                        GFP_KERNEL);
+       job->gather_copy_mapped = dma_alloc_wc(dev, size, &job->gather_copy,
+                                              GFP_KERNEL);
         if (!job->gather_copy_mapped) {
                 job->gather_copy_mapped = NULL;
                 return -ENOMEM;
@@ -578,9 +577,8 @@ void host1x_job_unpin(struct host1x_job *job)
         job->num_unpins = 0;
  
         if (job->gather_copy_size)
-               dma_free_writecombine(job->channel->dev, job->gather_copy_size,
-                                     job->gather_copy_mapped,
-                                     job->gather_copy);
+               dma_free_wc(job->channel->dev, job->gather_copy_size,
+                           job->gather_copy_mapped, job->gather_copy);
  }
  EXPORT_SYMBOL(host1x_job_unpin);
  
diff --git a/drivers/gpu/ipu-v3/ipu-common.c b/drivers/gpu/ipu-v3/ipu-common.c

index f2e13eb8339ffc1287110e247e0e54e922b7d72a..e00db3f510dd425c62565d913c937c5638a072d5 100644 (file)
--- a/drivers/gpu/ipu-v3/ipu-common.c
+++ b/drivers/gpu/ipu-v3/ipu-common.c
@@ -1050,6 +1050,17 @@ static int ipu_add_client_devices(struct ipu_soc *ipu, unsigned long ipu_base)
         for (i = 0; i < ARRAY_SIZE(client_reg); i++) {
                 const struct ipu_platform_reg *reg = &client_reg[i];
                 struct platform_device *pdev;
+               struct device_node *of_node;
+
+               /* Associate subdevice with the corresponding port node */
+               of_node = of_graph_get_port_by_id(dev->of_node, i);
+               if (!of_node) {
+                       dev_info(dev,
+                                "no port@%d node in %s, not using %s%d\n",
+                                i, dev->of_node->full_name,
+                                (i / 2) ? "DI" : "CSI", i % 2);
+                       continue;
+               }
  
                 pdev = platform_device_alloc(reg->name, id++);
                 if (!pdev) {
@@ -1057,17 +1068,9 @@ static int ipu_add_client_devices(struct ipu_soc *ipu, unsigned long ipu_base)
                         goto err_register;
                 }
  
+               pdev->dev.of_node = of_node;
                 pdev->dev.parent = dev;
  
-               /* Associate subdevice with the corresponding port node */
-               pdev->dev.of_node = of_graph_get_port_by_id(dev->of_node, i);
-               if (!pdev->dev.of_node) {
-                       dev_err(dev, "missing port@%d node in %s\n", i,
-                               dev->of_node->full_name);
-                       ret = -ENODEV;
-                       goto err_register;
-               }
-
                 ret = platform_device_add_data(pdev, &reg->pdata,
                                                sizeof(reg->pdata));
                 if (!ret)
@@ -1289,10 +1292,6 @@ static int ipu_probe(struct platform_device *pdev)
         ipu->irq_sync = irq_sync;
         ipu->irq_err = irq_err;
  
-       ret = ipu_irq_init(ipu);
-       if (ret)
-               goto out_failed_irq;
-
         ret = device_reset(&pdev->dev);
         if (ret) {
                 dev_err(&pdev->dev, "failed to reset: %d\n", ret);
@@ -1302,6 +1301,10 @@ static int ipu_probe(struct platform_device *pdev)
         if (ret)
                 goto out_failed_reset;
  
+       ret = ipu_irq_init(ipu);
+       if (ret)
+               goto out_failed_irq;
+
         /* Set MCU_T to divide MCU access window into 2 */
         ipu_cm_write(ipu, 0x00400000L | (IPU_MCU_T_DEFAULT << 18),
                         IPU_DISP_GEN);
@@ -1324,9 +1327,9 @@ static int ipu_probe(struct platform_device *pdev)
  failed_add_clients:
         ipu_submodules_exit(ipu);
  failed_submodules_init:
-out_failed_reset:
         ipu_irq_exit(ipu);
  out_failed_irq:
+out_failed_reset:
         clk_disable_unprepare(ipu->clk);
         return ret;
  }
diff --git a/drivers/media/media-device.c b/drivers/media/media-device.c

index 7dae0ac0f3aec12c5d44875111d3a69818e43c92..e9219f528d7e8ada4876649c28b9d0c54ce68117 100644 (file)
--- a/drivers/media/media-device.c
+++ b/drivers/media/media-device.c
@@ -20,6 +20,9 @@
   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   */
  
+/* We need to access legacy defines from linux/media.h */
+#define __NEED_MEDIA_LEGACY_API
+
  #include <linux/compat.h>
  #include <linux/export.h>
  #include <linux/idr.h>
@@ -115,6 +118,26 @@ static long media_device_enum_entities(struct media_device *mdev,
         u_ent.group_id = 0;             /* Unused */
         u_ent.pads = ent->num_pads;
         u_ent.links = ent->num_links - ent->num_backlinks;
+
+       /*
+        * Workaround for a bug at media-ctl <= v1.10 that makes it to
+        * do the wrong thing if the entity function doesn't belong to
+        * either MEDIA_ENT_F_OLD_BASE or MEDIA_ENT_F_OLD_SUBDEV_BASE
+        * Ranges.
+        *
+        * Non-subdevices are expected to be at the MEDIA_ENT_F_OLD_BASE,
+        * or, otherwise, will be silently ignored by media-ctl when
+        * printing the graphviz diagram. So, map them into the devnode
+        * old range.
+        */
+       if (ent->function < MEDIA_ENT_F_OLD_BASE ||
+           ent->function > MEDIA_ENT_T_DEVNODE_UNKNOWN) {
+               if (is_media_entity_v4l2_subdev(ent))
+                       u_ent.type = MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN;
+               else if (ent->function != MEDIA_ENT_F_IO_V4L)
+                       u_ent.type = MEDIA_ENT_T_DEVNODE_UNKNOWN;
+       }
+
         memcpy(&u_ent.raw, &ent->info, sizeof(ent->info));
         if (copy_to_user(uent, &u_ent, sizeof(u_ent)))
                 return -EFAULT;
diff --git a/drivers/media/platform/coda/coda-bit.c b/drivers/media/platform/coda/coda-bit.c

index 7d28899f89ce16c00cd9f00f424a948fa57e2ae8..38aacc7fc692024b58afd94708218b022bafd318 100644 (file)
--- a/drivers/media/platform/coda/coda-bit.c
+++ b/drivers/media/platform/coda/coda-bit.c
@@ -1455,9 +1455,9 @@ static int coda_alloc_bitstream_buffer(struct coda_ctx *ctx,
                 return 0;
  
         ctx->bitstream.size = roundup_pow_of_two(q_data->sizeimage * 2);
-       ctx->bitstream.vaddr = dma_alloc_writecombine(
-                       &ctx->dev->plat_dev->dev, ctx->bitstream.size,
-                       &ctx->bitstream.paddr, GFP_KERNEL);
+       ctx->bitstream.vaddr = dma_alloc_wc(&ctx->dev->plat_dev->dev,
+                                           ctx->bitstream.size,
+                                           &ctx->bitstream.paddr, GFP_KERNEL);
         if (!ctx->bitstream.vaddr) {
                 v4l2_err(&ctx->dev->v4l2_dev,
                          "failed to allocate bitstream ringbuffer");
@@ -1474,8 +1474,8 @@ static void coda_free_bitstream_buffer(struct coda_ctx *ctx)
         if (ctx->bitstream.vaddr == NULL)
                 return;
  
-       dma_free_writecombine(&ctx->dev->plat_dev->dev, ctx->bitstream.size,
-                             ctx->bitstream.vaddr, ctx->bitstream.paddr);
+       dma_free_wc(&ctx->dev->plat_dev->dev, ctx->bitstream.size,
+                   ctx->bitstream.vaddr, ctx->bitstream.paddr);
         ctx->bitstream.vaddr = NULL;
         kfifo_init(&ctx->bitstream_fifo, NULL, 0);
  }
diff --git a/drivers/misc/lkdtm.c b/drivers/misc/lkdtm.c

index 11fdadc68e53e57722b4d58892fdf2c644beba34..2a6eaf1122b4e9b742eb3777fb4b6b317c07b201 100644 (file)
--- a/drivers/misc/lkdtm.c
+++ b/drivers/misc/lkdtm.c
@@ -103,6 +103,7 @@ enum ctype {
         CT_EXEC_USERSPACE,
         CT_ACCESS_USERSPACE,
         CT_WRITE_RO,
+       CT_WRITE_RO_AFTER_INIT,
         CT_WRITE_KERN,
  };
  
@@ -140,6 +141,7 @@ static char* cp_type[] = {
         "EXEC_USERSPACE",
         "ACCESS_USERSPACE",
         "WRITE_RO",
+       "WRITE_RO_AFTER_INIT",
         "WRITE_KERN",
  };
  
@@ -162,6 +164,7 @@ static DEFINE_SPINLOCK(lock_me_up);
  static u8 data_area[EXEC_SIZE];
  
  static const unsigned long rodata = 0xAA55AA55;
+static unsigned long ro_after_init __ro_after_init = 0x55AA5500;
  
  module_param(recur_count, int, 0644);
  MODULE_PARM_DESC(recur_count, " Recursion level for the stack overflow test");
@@ -503,11 +506,28 @@ static void lkdtm_do_action(enum ctype which)
                 break;
         }
         case CT_WRITE_RO: {
-               unsigned long *ptr;
+               /* Explicitly cast away "const" for the test. */
+               unsigned long *ptr = (unsigned long *)&rodata;
  
-               ptr = (unsigned long *)&rodata;
+               pr_info("attempting bad rodata write at %p\n", ptr);
+               *ptr ^= 0xabcd1234;
  
-               pr_info("attempting bad write at %p\n", ptr);
+               break;
+       }
+       case CT_WRITE_RO_AFTER_INIT: {
+               unsigned long *ptr = &ro_after_init;
+
+               /*
+                * Verify we were written to during init. Since an Oops
+                * is considered a "success", a failure is to just skip the
+                * real test.
+                */
+               if ((*ptr & 0xAA) != 0xAA) {
+                       pr_info("%p was NOT written during init!?\n", ptr);
+                       break;
+               }
+
+               pr_info("attempting bad ro_after_init write at %p\n", ptr);
                 *ptr ^= 0xabcd1234;
  
                 break;
@@ -817,6 +837,9 @@ static int __init lkdtm_module_init(void)
         int n_debugfs_entries = 1; /* Assume only the direct entry */
         int i;
  
+       /* Make sure we can write to __ro_after_init values during __init */
+       ro_after_init |= 0xAA;
+
         /* Register debugfs interface */
         lkdtm_debugfs_root = debugfs_create_dir("provoke-crash", NULL);
         if (!lkdtm_debugfs_root) {
diff --git a/drivers/mtd/tests/mtd_nandecctest.c b/drivers/mtd/tests/mtd_nandecctest.c

index 79316159eec63cbca36aa06d27bfaab561b62dc0..88b6c81cebbe68123da96775b3a42c3f96da4a11 100644 (file)
--- a/drivers/mtd/tests/mtd_nandecctest.c
+++ b/drivers/mtd/tests/mtd_nandecctest.c
@@ -187,7 +187,7 @@ static int double_bit_error_detect(void *error_data, void *error_ecc,
         __nand_calculate_ecc(error_data, size, calc_ecc);
         ret = __nand_correct_data(error_data, error_ecc, calc_ecc, size);
  
-       return (ret == -1) ? 0 : -EINVAL;
+       return (ret == -EBADMSG) ? 0 : -EINVAL;
  }
  
  static const struct nand_ecc_test nand_ecc_test[] = {
diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c

index b0045a505dc88c9ef56dcc1094a198bd011d24a7..95825b38559addb8a0dc5cca22bc305e228e65f6 100644 (file)
--- a/drivers/nvdimm/e820.c
+++ b/drivers/nvdimm/e820.c
@@ -55,7 +55,7 @@ static int e820_pmem_probe(struct platform_device *pdev)
         for (p = iomem_resource.child; p ; p = p->sibling) {
                 struct nd_region_desc ndr_desc;
  
-               if (strncmp(p->name, "Persistent Memory (legacy)", 26) != 0)
+               if (p->desc != IORES_DESC_PERSISTENT_MEMORY_LEGACY)
                         continue;
  
                 memset(&ndr_desc, 0, sizeof(ndr_desc));
diff --git a/drivers/parisc/eisa_enumerator.c b/drivers/parisc/eisa_enumerator.c

index a656d9e8334302c2ec4f013ad6c016034ace5f27..21905fef2cbf0d34259e37069e50140a66101441 100644 (file)
--- a/drivers/parisc/eisa_enumerator.c
+++ b/drivers/parisc/eisa_enumerator.c
@@ -91,7 +91,7 @@ static int configure_memory(const unsigned char *buf,
         for (i=0;i<HPEE_MEMORY_MAX_ENT;i++) {
                 c = get_8(buf+len);
                 
-               if (NULL != (res = kmalloc(sizeof(struct resource), GFP_KERNEL))) {
+               if (NULL != (res = kzalloc(sizeof(struct resource), GFP_KERNEL))) {
                         int result;
                         
                         res->name = name;
@@ -183,7 +183,7 @@ static int configure_port(const unsigned char *buf, struct resource *io_parent,
         for (i=0;i<HPEE_PORT_MAX_ENT;i++) {
                 c = get_8(buf+len);
                 
-               if (NULL != (res = kmalloc(sizeof(struct resource), GFP_KERNEL))) {
+               if (NULL != (res = kzalloc(sizeof(struct resource), GFP_KERNEL))) {
                         res->name = board;
                         res->start = get_16(buf+len+1);
                         res->end = get_16(buf+len+1)+(c&HPEE_PORT_SIZE_MASK)+1;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c

index 602eb422351060c611967538359b8409885397a1..f89db3af0607263648c9a64432bcb1efdaeb8326 100644 (file)
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4772,8 +4772,10 @@ int pci_get_new_domain_nr(void)
  void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent)
  {
         static int use_dt_domains = -1;
-       int domain = of_get_pci_domain_nr(parent->of_node);
+       int domain = -1;
  
+       if (parent)
+               domain = of_get_pci_domain_nr(parent->of_node);
         /*
          * Check DT domain and use_dt_domains values.
          *
diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c

index d7b87c64b7cd261c6ec6a1b22ddce040747679d9..e220edc85c68a12bda653b0177255cf38459c7d2 100644 (file)
--- a/drivers/rapidio/rio.c
+++ b/drivers/rapidio/rio.c
@@ -117,7 +117,7 @@ int rio_request_inb_mbox(struct rio_mport *mport,
         if (mport->ops->open_inb_mbox == NULL)
                 goto out;
  
-       res = kmalloc(sizeof(struct resource), GFP_KERNEL);
+       res = kzalloc(sizeof(struct resource), GFP_KERNEL);
  
         if (res) {
                 rio_init_mbox_res(res, mbox, mbox);
@@ -185,7 +185,7 @@ int rio_request_outb_mbox(struct rio_mport *mport,
         if (mport->ops->open_outb_mbox == NULL)
                 goto out;
  
-       res = kmalloc(sizeof(struct resource), GFP_KERNEL);
+       res = kzalloc(sizeof(struct resource), GFP_KERNEL);
  
         if (res) {
                 rio_init_mbox_res(res, mbox, mbox);
@@ -285,7 +285,7 @@ int rio_request_inb_dbell(struct rio_mport *mport,
  {
         int rc = 0;
  
-       struct resource *res = kmalloc(sizeof(struct resource), GFP_KERNEL);
+       struct resource *res = kzalloc(sizeof(struct resource), GFP_KERNEL);
  
         if (res) {
                 rio_init_dbell_res(res, start, end);
@@ -360,7 +360,7 @@ int rio_release_inb_dbell(struct rio_mport *mport, u16 start, u16 end)
  struct resource *rio_request_outb_dbell(struct rio_dev *rdev, u16 start,
                                         u16 end)
  {
-       struct resource *res = kmalloc(sizeof(struct resource), GFP_KERNEL);
+       struct resource *res = kzalloc(sizeof(struct resource), GFP_KERNEL);
  
         if (res) {
                 rio_init_dbell_res(res, start, end);
diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c

index cb61f300f8b5d111ce7871a39be43a1b7fe3ca80..277b5c8c825ca4a63864a01ba67cbd8765b5a041 100644 (file)
--- a/drivers/s390/block/dasd_diag.c
+++ b/drivers/s390/block/dasd_diag.c
@@ -67,7 +67,7 @@ static const u8 DASD_DIAG_CMS1[] = { 0xc3, 0xd4, 0xe2, 0xf1 };/* EBCDIC CMS1 */
   * and function code cmd.
   * In case of an exception return 3. Otherwise return result of bitwise OR of
   * resulting condition code and DIAG return code. */
-static inline int dia250(void *iob, int cmd)
+static inline int __dia250(void *iob, int cmd)
  {
         register unsigned long reg2 asm ("2") = (unsigned long) iob;
         typedef union {
@@ -77,7 +77,6 @@ static inline int dia250(void *iob, int cmd)
         int rc;
  
         rc = 3;
-       diag_stat_inc(DIAG_STAT_X250);
         asm volatile(
                 "       diag    2,%2,0x250\n"
                 "0:     ipm     %0\n"
@@ -91,6 +90,12 @@ static inline int dia250(void *iob, int cmd)
         return rc;
  }
  
+static inline int dia250(void *iob, int cmd)
+{
+       diag_stat_inc(DIAG_STAT_X250);
+       return __dia250(iob, cmd);
+}
+
  /* Initialize block I/O to DIAG device using the specified blocksize and
   * block offset. On success, return zero and set end_block to contain the
   * number of blocks on the device minus the specified offset. Return non-zero
diff --git a/drivers/sh/superhyway/superhyway.c b/drivers/sh/superhyway/superhyway.c

index 2d9e7f3d56115aeaddea8eb250008dba7729d98e..bb1fb771213401a8cbf67b98011f464409ecb699 100644 (file)
--- a/drivers/sh/superhyway/superhyway.c
+++ b/drivers/sh/superhyway/superhyway.c
@@ -66,7 +66,7 @@ int superhyway_add_device(unsigned long base, struct superhyway_device *sdev,
         superhyway_read_vcr(dev, base, &dev->vcr);
  
         if (!dev->resource) {
-               dev->resource = kmalloc(sizeof(struct resource), GFP_KERNEL);
+               dev->resource = kzalloc(sizeof(struct resource), GFP_KERNEL);
                 if (!dev->resource) {
                         kfree(dev);
                         return -ENOMEM;
diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c

index 6a4ff27f4357eb229815c18b535327487f2df580..c688efa95e29b0561ac14be901cc4e4faa5aaf33 100644 (file)
--- a/drivers/spi/spi-imx.c
+++ b/drivers/spi/spi-imx.c
@@ -204,8 +204,8 @@ static bool spi_imx_can_dma(struct spi_master *master, struct spi_device *spi,
  {
         struct spi_imx_data *spi_imx = spi_master_get_devdata(master);
  
-       if (spi_imx->dma_is_inited &&
-           transfer->len > spi_imx->wml * sizeof(u32))
+       if (spi_imx->dma_is_inited && transfer->len >= spi_imx->wml &&
+           (transfer->len % spi_imx->wml) == 0)
                 return true;
         return false;
  }
@@ -919,8 +919,6 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
         struct dma_async_tx_descriptor *desc_tx = NULL, *desc_rx = NULL;
         int ret;
         unsigned long timeout;
-       u32 dma;
-       int left;
         struct spi_master *master = spi_imx->bitbang.master;
         struct sg_table *tx = &transfer->tx_sg, *rx = &transfer->rx_sg;
  
@@ -954,13 +952,6 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
         /* Trigger the cspi module. */
         spi_imx->dma_finished = 0;
  
-       dma = readl(spi_imx->base + MX51_ECSPI_DMA);
-       dma = dma & (~MX51_ECSPI_DMA_RXT_WML_MASK);
-       /* Change RX_DMA_LENGTH trigger dma fetch tail data */
-       left = transfer->len % spi_imx->wml;
-       if (left)
-               writel(dma | (left << MX51_ECSPI_DMA_RXT_WML_OFFSET),
-                               spi_imx->base + MX51_ECSPI_DMA);
         /*
          * Set these order to avoid potential RX overflow. The overflow may
          * happen if we enable SPI HW before starting RX DMA due to rescheduling
@@ -992,10 +983,6 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
                         spi_imx->devtype_data->reset(spi_imx);
                         dmaengine_terminate_all(master->dma_rx);
                 }
-               dma &= ~MX51_ECSPI_DMA_RXT_WML_MASK;
-               writel(dma |
-                      spi_imx->wml << MX51_ECSPI_DMA_RXT_WML_OFFSET,
-                      spi_imx->base + MX51_ECSPI_DMA);
         }
  
         spi_imx->dma_finished = 1;
diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c

index 79a8bc4f6cec9e32ec9c78627a2e68fc85ab5e12..7cb1b2d710c10f7aab6c245a31d34e5439dfcbf0 100644 (file)
--- a/drivers/spi/spi-rockchip.c
+++ b/drivers/spi/spi-rockchip.c
@@ -749,6 +749,7 @@ static int rockchip_spi_probe(struct platform_device *pdev)
         return 0;
  
  err_register_master:
+       pm_runtime_disable(&pdev->dev);
         if (rs->dma_tx.ch)
                 dma_release_channel(rs->dma_tx.ch);
         if (rs->dma_rx.ch)
@@ -778,6 +779,8 @@ static int rockchip_spi_remove(struct platform_device *pdev)
         if (rs->dma_rx.ch)
                 dma_release_channel(rs->dma_rx.ch);
  
+       spi_master_put(master);
+
         return 0;
  }
  
diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c

index 82a663ba98009fb5f286a88651c9b3a54cf68bbb..4f229e711e1c1cfc0134abb71b617f9511a6e606 100644 (file)
--- a/drivers/target/target_core_tmr.c
+++ b/drivers/target/target_core_tmr.c
@@ -177,7 +177,6 @@ void core_tmr_abort_task(
  
                 if (!__target_check_io_state(se_cmd, se_sess, 0)) {
                         spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags);
-                       target_put_sess_cmd(se_cmd);
                         goto out;
                 }
                 list_del_init(&se_cmd->se_cmd_list);
diff --git a/drivers/video/fbdev/acornfb.c b/drivers/video/fbdev/acornfb.c

index a305caea58eea717fdffaac4c5364552673ad5ae..fb75b7e5a19ab52748e4a1980d56046e15716908 100644 (file)
--- a/drivers/video/fbdev/acornfb.c
+++ b/drivers/video/fbdev/acornfb.c
@@ -1040,8 +1040,8 @@ static int acornfb_probe(struct platform_device *dev)
                  * for the framebuffer if we are not using
                  * VRAM.
                  */
-               base = dma_alloc_writecombine(current_par.dev, size, &handle,
-                                             GFP_KERNEL);
+               base = dma_alloc_wc(current_par.dev, size, &handle,
+                                   GFP_KERNEL);
                 if (base == NULL) {
                         printk(KERN_ERR "acornfb: unable to allocate screen "
                                "memory\n");
diff --git a/drivers/video/fbdev/amba-clcd-versatile.c b/drivers/video/fbdev/amba-clcd-versatile.c

index 7a8afcd4573e5c1e595550f25985ea1181367157..a8a22daa3f9d2a70a6fa1ca5f4e0f526e7380375 100644 (file)
--- a/drivers/video/fbdev/amba-clcd-versatile.c
+++ b/drivers/video/fbdev/amba-clcd-versatile.c
@@ -154,8 +154,8 @@ int versatile_clcd_setup_dma(struct clcd_fb *fb, unsigned long framesize)
  {
         dma_addr_t dma;
  
-       fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev, framesize,
-                                                   &dma, GFP_KERNEL);
+       fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, framesize, &dma,
+                                         GFP_KERNEL);
         if (!fb->fb.screen_base) {
                 pr_err("CLCD: unable to map framebuffer\n");
                 return -ENOMEM;
@@ -169,14 +169,12 @@ int versatile_clcd_setup_dma(struct clcd_fb *fb, unsigned long framesize)
  
  int versatile_clcd_mmap_dma(struct clcd_fb *fb, struct vm_area_struct *vma)
  {
-       return dma_mmap_writecombine(&fb->dev->dev, vma,
-                                    fb->fb.screen_base,
-                                    fb->fb.fix.smem_start,
-                                    fb->fb.fix.smem_len);
+       return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+                          fb->fb.fix.smem_start, fb->fb.fix.smem_len);
  }
  
  void versatile_clcd_remove_dma(struct clcd_fb *fb)
  {
-       dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len,
-                             fb->fb.screen_base, fb->fb.fix.smem_start);
+       dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
+                   fb->fb.fix.smem_start);
  }
diff --git a/drivers/video/fbdev/amba-clcd.c b/drivers/video/fbdev/amba-clcd.c

index 9362424c2340490585fe02e4dfe950f53f2097af..fe274b5851c77ff5e59b7c88f205cbf09ec61114 100644 (file)
--- a/drivers/video/fbdev/amba-clcd.c
+++ b/drivers/video/fbdev/amba-clcd.c
@@ -774,8 +774,8 @@ static int clcdfb_of_dma_setup(struct clcd_fb *fb)
  
  static int clcdfb_of_dma_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
  {
-       return dma_mmap_writecombine(&fb->dev->dev, vma, fb->fb.screen_base,
-                       fb->fb.fix.smem_start, fb->fb.fix.smem_len);
+       return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+                          fb->fb.fix.smem_start, fb->fb.fix.smem_len);
  }
  
  static void clcdfb_of_dma_remove(struct clcd_fb *fb)
diff --git a/drivers/video/fbdev/atmel_lcdfb.c b/drivers/video/fbdev/atmel_lcdfb.c

index 19eb42b57d8742089d2faadf5c8a64ce81cc01b3..56c60e67316a57457c1741ea62d614e113d0a00e 100644 (file)
--- a/drivers/video/fbdev/atmel_lcdfb.c
+++ b/drivers/video/fbdev/atmel_lcdfb.c
@@ -414,8 +414,8 @@ static inline void atmel_lcdfb_free_video_memory(struct atmel_lcdfb_info *sinfo)
  {
         struct fb_info *info = sinfo->info;
  
-       dma_free_writecombine(info->device, info->fix.smem_len,
-                               info->screen_base, info->fix.smem_start);
+       dma_free_wc(info->device, info->fix.smem_len, info->screen_base,
+                   info->fix.smem_start);
  }
  
  /**
@@ -435,8 +435,9 @@ static int atmel_lcdfb_alloc_video_memory(struct atmel_lcdfb_info *sinfo)
                     * ((var->bits_per_pixel + 7) / 8));
         info->fix.smem_len = max(smem_len, sinfo->smem_len);
  
-       info->screen_base = dma_alloc_writecombine(info->device, info->fix.smem_len,
-                                       (dma_addr_t *)&info->fix.smem_start, GFP_KERNEL);
+       info->screen_base = dma_alloc_wc(info->device, info->fix.smem_len,
+                                        (dma_addr_t *)&info->fix.smem_start,
+                                        GFP_KERNEL);
  
         if (!info->screen_base) {
                 return -ENOMEM;
diff --git a/drivers/video/fbdev/ep93xx-fb.c b/drivers/video/fbdev/ep93xx-fb.c

index 5b1081030cbbbefb3f7a8ff1f5f7dbed03ebb2fa..75f0db25d19fef774b4903f71b43f9beb5a578f2 100644 (file)
--- a/drivers/video/fbdev/ep93xx-fb.c
+++ b/drivers/video/fbdev/ep93xx-fb.c
@@ -316,9 +316,8 @@ static int ep93xxfb_mmap(struct fb_info *info, struct vm_area_struct *vma)
         unsigned int offset = vma->vm_pgoff << PAGE_SHIFT;
  
         if (offset < info->fix.smem_len) {
-               return dma_mmap_writecombine(info->dev, vma, info->screen_base,
-                                            info->fix.smem_start,
-                                            info->fix.smem_len);
+               return dma_mmap_wc(info->dev, vma, info->screen_base,
+                                  info->fix.smem_start, info->fix.smem_len);
         }
  
         return -EINVAL;
@@ -428,8 +427,7 @@ static int ep93xxfb_alloc_videomem(struct fb_info *info)
         /* Maximum 16bpp -> used memory is maximum x*y*2 bytes */
         fb_size = EP93XXFB_MAX_XRES * EP93XXFB_MAX_YRES * 2;
  
-       virt_addr = dma_alloc_writecombine(info->dev, fb_size,
-                                          &phys_addr, GFP_KERNEL);
+       virt_addr = dma_alloc_wc(info->dev, fb_size, &phys_addr, GFP_KERNEL);
         if (!virt_addr)
                 return -ENOMEM;
  
diff --git a/drivers/video/fbdev/gbefb.c b/drivers/video/fbdev/gbefb.c

index b63d55f481fae421cbea5cb71466401835e23a3d..1a242b1338e9a75ff4436e8532f64a37124055cd 100644 (file)
--- a/drivers/video/fbdev/gbefb.c
+++ b/drivers/video/fbdev/gbefb.c
@@ -1185,8 +1185,8 @@ static int gbefb_probe(struct platform_device *p_dev)
         } else {
                 /* try to allocate memory with the classical allocator
                  * this has high chance to fail on low memory machines */
-               gbe_mem = dma_alloc_writecombine(NULL, gbe_mem_size,
-                                                &gbe_dma_addr, GFP_KERNEL);
+               gbe_mem = dma_alloc_wc(NULL, gbe_mem_size, &gbe_dma_addr,
+                                      GFP_KERNEL);
                 if (!gbe_mem) {
                         printk(KERN_ERR "gbefb: couldn't allocate framebuffer memory\n");
                         ret = -ENOMEM;
@@ -1238,7 +1238,7 @@ static int gbefb_probe(struct platform_device *p_dev)
  out_gbe_unmap:
         arch_phys_wc_del(par->wc_cookie);
         if (gbe_dma_addr)
-               dma_free_writecombine(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys);
+               dma_free_wc(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys);
  out_tiles_free:
         dma_free_coherent(NULL, GBE_TLB_SIZE * sizeof(uint16_t),
                           (void *)gbe_tiles.cpu, gbe_tiles.dma);
@@ -1259,7 +1259,7 @@ static int gbefb_remove(struct platform_device* p_dev)
         gbe_turn_off();
         arch_phys_wc_del(par->wc_cookie);
         if (gbe_dma_addr)
-               dma_free_writecombine(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys);
+               dma_free_wc(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys);
         dma_free_coherent(NULL, GBE_TLB_SIZE * sizeof(uint16_t),
                           (void *)gbe_tiles.cpu, gbe_tiles.dma);
         release_mem_region(GBE_BASE, sizeof(struct sgi_gbe));
diff --git a/drivers/video/fbdev/imxfb.c b/drivers/video/fbdev/imxfb.c

index bb2f1e866020199d31b8ba0b1940cb9e6b87923f..76b6a7784b06c7c752ba79862c26b2edcf51f932 100644 (file)
--- a/drivers/video/fbdev/imxfb.c
+++ b/drivers/video/fbdev/imxfb.c
@@ -937,8 +937,8 @@ static int imxfb_probe(struct platform_device *pdev)
         }
  
         fbi->map_size = PAGE_ALIGN(info->fix.smem_len);
-       info->screen_base = dma_alloc_writecombine(&pdev->dev, fbi->map_size,
-                                                  &fbi->map_dma, GFP_KERNEL);
+       info->screen_base = dma_alloc_wc(&pdev->dev, fbi->map_size,
+                                        &fbi->map_dma, GFP_KERNEL);
  
         if (!info->screen_base) {
                 dev_err(&pdev->dev, "Failed to allocate video RAM: %d\n", ret);
@@ -1005,8 +1005,8 @@ failed_cmap:
         if (pdata && pdata->exit)
                 pdata->exit(fbi->pdev);
  failed_platform_init:
-       dma_free_writecombine(&pdev->dev, fbi->map_size, info->screen_base,
-                             fbi->map_dma);
+       dma_free_wc(&pdev->dev, fbi->map_size, info->screen_base,
+                   fbi->map_dma);
  failed_map:
         iounmap(fbi->regs);
  failed_ioremap:
@@ -1041,8 +1041,8 @@ static int imxfb_remove(struct platform_device *pdev)
         kfree(info->pseudo_palette);
         framebuffer_release(info);
  
-       dma_free_writecombine(&pdev->dev, fbi->map_size, info->screen_base,
-                             fbi->map_dma);
+       dma_free_wc(&pdev->dev, fbi->map_size, info->screen_base,
+                   fbi->map_dma);
  
         iounmap(fbi->regs);
         release_mem_region(res->start, resource_size(res));
diff --git a/drivers/video/fbdev/mx3fb.c b/drivers/video/fbdev/mx3fb.c

index 7947634ee6b06c6f63707c87ef754d22342e975c..f91b1db262b04a2b131911009e3c69914b254013 100644 (file)
--- a/drivers/video/fbdev/mx3fb.c
+++ b/drivers/video/fbdev/mx3fb.c
@@ -1336,9 +1336,8 @@ static int mx3fb_map_video_memory(struct fb_info *fbi, unsigned int mem_len,
         int retval = 0;
         dma_addr_t addr;
  
-       fbi->screen_base = dma_alloc_writecombine(fbi->device,
-                                                 mem_len,
-                                                 &addr, GFP_DMA | GFP_KERNEL);
+       fbi->screen_base = dma_alloc_wc(fbi->device, mem_len, &addr,
+                                       GFP_DMA | GFP_KERNEL);
  
         if (!fbi->screen_base) {
                 dev_err(fbi->device, "Cannot allocate %u bytes framebuffer memory\n",
@@ -1378,8 +1377,8 @@ err0:
   */
  static int mx3fb_unmap_video_memory(struct fb_info *fbi)
  {
-       dma_free_writecombine(fbi->device, fbi->fix.smem_len,
-                             fbi->screen_base, fbi->fix.smem_start);
+       dma_free_wc(fbi->device, fbi->fix.smem_len, fbi->screen_base,
+                   fbi->fix.smem_start);
  
         fbi->screen_base = NULL;
         mutex_lock(&fbi->mm_lock);
diff --git a/drivers/video/fbdev/nuc900fb.c b/drivers/video/fbdev/nuc900fb.c

index 389fa2cbb713108623ca07b67941572ccdc3fa85..6680edae4696d1b0dc4dff547dda27ca8c159ab5 100644 (file)
--- a/drivers/video/fbdev/nuc900fb.c
+++ b/drivers/video/fbdev/nuc900fb.c
@@ -396,8 +396,8 @@ static int nuc900fb_map_video_memory(struct fb_info *info)
         dev_dbg(fbi->dev, "nuc900fb_map_video_memory(fbi=%p) map_size %lu\n",
                 fbi, map_size);
  
-       info->screen_base = dma_alloc_writecombine(fbi->dev, map_size,
-                                                       &map_dma, GFP_KERNEL);
+       info->screen_base = dma_alloc_wc(fbi->dev, map_size, &map_dma,
+                                        GFP_KERNEL);
  
         if (!info->screen_base)
                 return -ENOMEM;
@@ -411,8 +411,8 @@ static int nuc900fb_map_video_memory(struct fb_info *info)
  static inline void nuc900fb_unmap_video_memory(struct fb_info *info)
  {
         struct nuc900fb_info *fbi = info->par;
-       dma_free_writecombine(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
-                             info->screen_base, info->fix.smem_start);
+       dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
+                   info->screen_base, info->fix.smem_start);
  }
  
  static irqreturn_t nuc900fb_irqhandler(int irq, void *dev_id)
diff --git a/drivers/video/fbdev/omap/lcdc.c b/drivers/video/fbdev/omap/lcdc.c

index 6efa2591eaa8dfa6d72a22cd18eaf7f07c36174d..e3d9b9ea5498e0f840c5189bd1401200494d9ae2 100644 (file)
--- a/drivers/video/fbdev/omap/lcdc.c
+++ b/drivers/video/fbdev/omap/lcdc.c
@@ -612,8 +612,8 @@ static void lcdc_dma_handler(u16 status, void *data)
  
  static int alloc_palette_ram(void)
  {
-       lcdc.palette_virt = dma_alloc_writecombine(lcdc.fbdev->dev,
-               MAX_PALETTE_SIZE, &lcdc.palette_phys, GFP_KERNEL);
+       lcdc.palette_virt = dma_alloc_wc(lcdc.fbdev->dev, MAX_PALETTE_SIZE,
+                                        &lcdc.palette_phys, GFP_KERNEL);
         if (lcdc.palette_virt == NULL) {
                 dev_err(lcdc.fbdev->dev, "failed to alloc palette memory\n");
                 return -ENOMEM;
@@ -625,8 +625,8 @@ static int alloc_palette_ram(void)
  
  static void free_palette_ram(void)
  {
-       dma_free_writecombine(lcdc.fbdev->dev, MAX_PALETTE_SIZE,
-                       lcdc.palette_virt, lcdc.palette_phys);
+       dma_free_wc(lcdc.fbdev->dev, MAX_PALETTE_SIZE, lcdc.palette_virt,
+                   lcdc.palette_phys);
  }
  
  static int alloc_fbmem(struct omapfb_mem_region *region)
@@ -642,8 +642,8 @@ static int alloc_fbmem(struct omapfb_mem_region *region)
         if (region->size > frame_size)
                 frame_size = region->size;
         lcdc.vram_size = frame_size;
-       lcdc.vram_virt = dma_alloc_writecombine(lcdc.fbdev->dev,
-                       lcdc.vram_size, &lcdc.vram_phys, GFP_KERNEL);
+       lcdc.vram_virt = dma_alloc_wc(lcdc.fbdev->dev, lcdc.vram_size,
+                                     &lcdc.vram_phys, GFP_KERNEL);
         if (lcdc.vram_virt == NULL) {
                 dev_err(lcdc.fbdev->dev, "unable to allocate FB DMA memory\n");
                 return -ENOMEM;
@@ -660,8 +660,8 @@ static int alloc_fbmem(struct omapfb_mem_region *region)
  
  static void free_fbmem(void)
  {
-       dma_free_writecombine(lcdc.fbdev->dev, lcdc.vram_size,
-                             lcdc.vram_virt, lcdc.vram_phys);
+       dma_free_wc(lcdc.fbdev->dev, lcdc.vram_size, lcdc.vram_virt,
+                   lcdc.vram_phys);
  }
  
  static int setup_fbmem(struct omapfb_mem_desc *req_md)
diff --git a/drivers/video/fbdev/pxa168fb.c b/drivers/video/fbdev/pxa168fb.c

index efb57c059997641baed972fe2b25fd85715cf291..def3a501acd64484342f2315c9b32fe697b166a5 100644 (file)
--- a/drivers/video/fbdev/pxa168fb.c
+++ b/drivers/video/fbdev/pxa168fb.c
@@ -680,8 +680,8 @@ static int pxa168fb_probe(struct platform_device *pdev)
          */
         info->fix.smem_len = PAGE_ALIGN(DEFAULT_FB_SIZE);
  
-       info->screen_base = dma_alloc_writecombine(fbi->dev, info->fix.smem_len,
-                                               &fbi->fb_start_dma, GFP_KERNEL);
+       info->screen_base = dma_alloc_wc(fbi->dev, info->fix.smem_len,
+                                        &fbi->fb_start_dma, GFP_KERNEL);
         if (info->screen_base == NULL) {
                 ret = -ENOMEM;
                 goto failed_free_info;
@@ -804,8 +804,8 @@ static int pxa168fb_remove(struct platform_device *pdev)
  
         irq = platform_get_irq(pdev, 0);
  
-       dma_free_writecombine(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
-                               info->screen_base, info->fix.smem_start);
+       dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
+                   info->screen_base, info->fix.smem_start);
  
         clk_disable(fbi->clk);
  
diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c

index 33b2bb315a2a462daefa5bc2c3181806747dc919..2c0487f4f805cf7d4c9153d4c966f7cc67c45c81 100644 (file)
--- a/drivers/video/fbdev/pxafb.c
+++ b/drivers/video/fbdev/pxafb.c
@@ -2446,8 +2446,8 @@ static int pxafb_remove(struct platform_device *dev)
  
         free_pages_exact(fbi->video_mem, fbi->video_mem_size);
  
-       dma_free_writecombine(&dev->dev, fbi->dma_buff_size,
-                       fbi->dma_buff, fbi->dma_buff_phys);
+       dma_free_wc(&dev->dev, fbi->dma_buff_size, fbi->dma_buff,
+                   fbi->dma_buff_phys);
  
         iounmap(fbi->mmio_base);
  
diff --git a/drivers/video/fbdev/s3c-fb.c b/drivers/video/fbdev/s3c-fb.c

index f72dd12456f962eba79eec43fb706be9dc1cdd39..5f4f696c2ecfd5e677575675be2969b1ba09947d 100644 (file)
--- a/drivers/video/fbdev/s3c-fb.c
+++ b/drivers/video/fbdev/s3c-fb.c
@@ -1105,8 +1105,7 @@ static int s3c_fb_alloc_memory(struct s3c_fb *sfb, struct s3c_fb_win *win)
  
         dev_dbg(sfb->dev, "want %u bytes for window\n", size);
  
-       fbi->screen_base = dma_alloc_writecombine(sfb->dev, size,
-                                                 &map_dma, GFP_KERNEL);
+       fbi->screen_base = dma_alloc_wc(sfb->dev, size, &map_dma, GFP_KERNEL);
         if (!fbi->screen_base)
                 return -ENOMEM;
  
@@ -1131,8 +1130,8 @@ static void s3c_fb_free_memory(struct s3c_fb *sfb, struct s3c_fb_win *win)
         struct fb_info *fbi = win->fbinfo;
  
         if (fbi->screen_base)
-               dma_free_writecombine(sfb->dev, PAGE_ALIGN(fbi->fix.smem_len),
-                             fbi->screen_base, fbi->fix.smem_start);
+               dma_free_wc(sfb->dev, PAGE_ALIGN(fbi->fix.smem_len),
+                           fbi->screen_base, fbi->fix.smem_start);
  }
  
  /**
diff --git a/drivers/video/fbdev/s3c2410fb.c b/drivers/video/fbdev/s3c2410fb.c

index d6704add1601c184d217dfbe19dcaa9d91aace27..0dd86be36afbb5bdff4d268ae58c17337a98c6b9 100644 (file)
--- a/drivers/video/fbdev/s3c2410fb.c
+++ b/drivers/video/fbdev/s3c2410fb.c
@@ -645,8 +645,8 @@ static int s3c2410fb_map_video_memory(struct fb_info *info)
  
         dprintk("map_video_memory(fbi=%p) map_size %u\n", fbi, map_size);
  
-       info->screen_base = dma_alloc_writecombine(fbi->dev, map_size,
-                                                  &map_dma, GFP_KERNEL);
+       info->screen_base = dma_alloc_wc(fbi->dev, map_size, &map_dma,
+                                        GFP_KERNEL);
  
         if (info->screen_base) {
                 /* prevent initial garbage on screen */
@@ -667,8 +667,8 @@ static inline void s3c2410fb_unmap_video_memory(struct fb_info *info)
  {
         struct s3c2410fb_info *fbi = info->par;
  
-       dma_free_writecombine(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
-                             info->screen_base, info->fix.smem_start);
+       dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
+                   info->screen_base, info->fix.smem_start);
  }
  
  static inline void modify_gpio(void __iomem *reg,
diff --git a/drivers/video/fbdev/sa1100fb.c b/drivers/video/fbdev/sa1100fb.c

index dcf774c1588965032eb93ac3b18cc822f419de18..fc2aaa5aca2347e705c6eb1623ec188b5262e498 100644 (file)
--- a/drivers/video/fbdev/sa1100fb.c
+++ b/drivers/video/fbdev/sa1100fb.c
@@ -567,8 +567,8 @@ static int sa1100fb_mmap(struct fb_info *info,
  
         if (off < info->fix.smem_len) {
                 vma->vm_pgoff += 1; /* skip over the palette */
-               return dma_mmap_writecombine(fbi->dev, vma, fbi->map_cpu,
-                                            fbi->map_dma, fbi->map_size);
+               return dma_mmap_wc(fbi->dev, vma, fbi->map_cpu, fbi->map_dma,
+                                  fbi->map_size);
         }
  
         vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
@@ -1099,8 +1099,8 @@ static int sa1100fb_map_video_memory(struct sa1100fb_info *fbi)
          * of the framebuffer.
          */
         fbi->map_size = PAGE_ALIGN(fbi->fb.fix.smem_len + PAGE_SIZE);
-       fbi->map_cpu = dma_alloc_writecombine(fbi->dev, fbi->map_size,
-                                             &fbi->map_dma, GFP_KERNEL);
+       fbi->map_cpu = dma_alloc_wc(fbi->dev, fbi->map_size, &fbi->map_dma,
+                                   GFP_KERNEL);
  
         if (fbi->map_cpu) {
                 fbi->fb.screen_base = fbi->map_cpu + PAGE_SIZE;
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c

index 12eab503efd1bb8793fff55eca83649e1f564426..dc4305b407bf6756791ec9b31e7929c1fe5c3ca8 100644 (file)
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -257,7 +257,7 @@ static struct resource *additional_memory_resource(phys_addr_t size)
                 return NULL;
  
         res->name = "System RAM";
-       res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
  
         ret = allocate_resource(&iomem_resource, res,
                                 size, 0, -1,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c

index 4545e2e2ad45a556113eecf30b1a7d49ee7ca260..5699bbc23febaa25614c52f2b2546abe145fd69d 100644 (file)
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -931,7 +931,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags)
         if (bio_flags & EXTENT_BIO_TREE_LOG)
                 return 0;
  #ifdef CONFIG_X86
-       if (static_cpu_has_safe(X86_FEATURE_XMM4_2))
+       if (static_cpu_has(X86_FEATURE_XMM4_2))
                 return 0;
  #endif
         return 1;
diff --git a/fs/dax.c b/fs/dax.c

index 711172450da642ccdcf4ac7dcceb7546faf66fad..bbb2ad78377020ac85158fb61df067aceadaafe5 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1056,6 +1056,7 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
  int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
  {
         struct file *file = vma->vm_file;
+       int error;
  
         /*
          * We pass NO_SECTOR to dax_radix_entry() because we expect that a
@@ -1065,7 +1066,13 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
          * saves us from having to make a call to get_block() here to look
          * up the sector.
          */
-       dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
+       error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
+                       true);
+
+       if (error == -ENOMEM)
+               return VM_FAULT_OOM;
+       if (error)
+               return VM_FAULT_SIGBUS;
         return VM_FAULT_NOPAGE;
  }
  EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c

index e032a0423e351cabfd6fe6156cdfc9c53ebff216..4098acc701c3e66d5e8a229ed49828d69997655f 100644 (file)
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -390,6 +390,7 @@ data_copy:
                 *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
                 if (*err < 0)
                         break;
+               bh = bh->b_this_page;
         }
         if (!*err)
                 *err = block_commit_write(pagep[0], from, from + replaced_size);
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c

index d211b8e18566719c873838268217b143966e5965..30c4c9ebb693faaecf249df4c913c01bbaa58f20 100644 (file)
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -843,9 +843,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
  
                 pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n",
                           __func__, ret);
-               /* Might as well let the VFS know */
-               d_instantiate(new_dentry, d_inode(old_dentry));
-               ihold(d_inode(old_dentry));
+               /*
+                * We can't keep the target in dcache after that.
+                * For one thing, we can't afford dentry aliases for directories.
+                * For another, if there was a victim, we _can't_ set new inode
+                * for that sucker and we have to trigger mount eviction - the
+                * caller won't do it on its own since we are returning an error.
+                */
+               d_invalidate(new_dentry);
                 new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
                 return ret;
         }
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c

index 26c2de2de13fd0ce7bb55287c58e268cf4bf37e6..b7f8eaeea5d83d8a1cb66e2d697de5341e7361cd 100644 (file)
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -633,7 +633,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
                                 d_rehash(newdent);
                 } else {
                         spin_lock(&dentry->d_lock);
-                       NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
+                       NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE;
                         spin_unlock(&dentry->d_lock);
                 }
         } else {
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c

index 9581d190f6e12346e70226c93458df0a1791abe0..77ebc2bc1cca112056501fe4dc160d48cc7069cd 100644 (file)
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -147,6 +147,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
         ret = ocfs2_inode_lock(inode, &di_bh, 1);
         if (ret < 0) {
                 mlog_errno(ret);
+               if (ret == -ENOMEM)
+                       ret = VM_FAULT_OOM;
+               else
+                       ret = VM_FAULT_SIGBUS;
                 goto out;
         }
  
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c

index 594f7e63b432427fd5b6448afa0a75eb0b71d558..be5568839442d1ab50bf5cae293b7b2b133b525e 100644 (file)
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1109,27 +1109,10 @@ xlog_verify_head(
         bool                    tmp_wrapped;
  
         /*
-        * Search backwards through the log looking for the log record header
-        * block. This wraps all the way back around to the head so something is
-        * seriously wrong if we can't find it.
-        */
-       found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, rhead_blk,
-                                     rhead, wrapped);
-       if (found < 0)
-               return found;
-       if (!found) {
-               xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
-               return -EIO;
-       }
-
-       *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
-
-       /*
-        * Now that we have a tail block, check the head of the log for torn
-        * writes. Search again until we hit the tail or the maximum number of
-        * log record I/Os that could have been in flight at one time. Use a
-        * temporary buffer so we don't trash the rhead/bp pointer from the
-        * call above.
+        * Check the head of the log for torn writes. Search backwards from the
+        * head until we hit the tail or the maximum number of log record I/Os
+        * that could have been in flight at one time. Use a temporary buffer so
+        * we don't trash the rhead/bp pointers from the caller.
          */
         tmp_bp = xlog_get_bp(log, 1);
         if (!tmp_bp)
@@ -1215,6 +1198,115 @@ xlog_verify_head(
         return error;
  }
  
+/*
+ * Check whether the head of the log points to an unmount record. In other
+ * words, determine whether the log is clean. If so, update the in-core state
+ * appropriately.
+ */
+static int
+xlog_check_unmount_rec(
+       struct xlog             *log,
+       xfs_daddr_t             *head_blk,
+       xfs_daddr_t             *tail_blk,
+       struct xlog_rec_header  *rhead,
+       xfs_daddr_t             rhead_blk,
+       struct xfs_buf          *bp,
+       bool                    *clean)
+{
+       struct xlog_op_header   *op_head;
+       xfs_daddr_t             umount_data_blk;
+       xfs_daddr_t             after_umount_blk;
+       int                     hblks;
+       int                     error;
+       char                    *offset;
+
+       *clean = false;
+
+       /*
+        * Look for unmount record. If we find it, then we know there was a
+        * clean unmount. Since 'i' could be the last block in the physical
+        * log, we convert to a log block before comparing to the head_blk.
+        *
+        * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
+        * below. We won't want to clear the unmount record if there is one, so
+        * we pass the lsn of the unmount record rather than the block after it.
+        */
+       if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+               int     h_size = be32_to_cpu(rhead->h_size);
+               int     h_version = be32_to_cpu(rhead->h_version);
+
+               if ((h_version & XLOG_VERSION_2) &&
+                   (h_size > XLOG_HEADER_CYCLE_SIZE)) {
+                       hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
+                       if (h_size % XLOG_HEADER_CYCLE_SIZE)
+                               hblks++;
+               } else {
+                       hblks = 1;
+               }
+       } else {
+               hblks = 1;
+       }
+       after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
+       after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
+       if (*head_blk == after_umount_blk &&
+           be32_to_cpu(rhead->h_num_logops) == 1) {
+               umount_data_blk = rhead_blk + hblks;
+               umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
+               error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+               if (error)
+                       return error;
+
+               op_head = (struct xlog_op_header *)offset;
+               if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
+                       /*
+                        * Set tail and last sync so that newly written log
+                        * records will point recovery to after the current
+                        * unmount record.
+                        */
+                       xlog_assign_atomic_lsn(&log->l_tail_lsn,
+                                       log->l_curr_cycle, after_umount_blk);
+                       xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
+                                       log->l_curr_cycle, after_umount_blk);
+                       *tail_blk = after_umount_blk;
+
+                       *clean = true;
+               }
+       }
+
+       return 0;
+}
+
+static void
+xlog_set_state(
+       struct xlog             *log,
+       xfs_daddr_t             head_blk,
+       struct xlog_rec_header  *rhead,
+       xfs_daddr_t             rhead_blk,
+       bool                    bump_cycle)
+{
+       /*
+        * Reset log values according to the state of the log when we
+        * crashed.  In the case where head_blk == 0, we bump curr_cycle
+        * one because the next write starts a new cycle rather than
+        * continuing the cycle of the last good log record.  At this
+        * point we have guaranteed that all partial log records have been
+        * accounted for.  Therefore, we know that the last good log record
+        * written was complete and ended exactly on the end boundary
+        * of the physical log.
+        */
+       log->l_prev_block = rhead_blk;
+       log->l_curr_block = (int)head_blk;
+       log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
+       if (bump_cycle)
+               log->l_curr_cycle++;
+       atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
+       atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
+       xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
+                                       BBTOB(log->l_curr_block));
+       xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
+                                       BBTOB(log->l_curr_block));
+}
+
  /*
   * Find the sync block number or the tail of the log.
   *
@@ -1238,22 +1330,20 @@ xlog_find_tail(
         xfs_daddr_t             *tail_blk)
  {
         xlog_rec_header_t       *rhead;
-       xlog_op_header_t        *op_head;
         char                    *offset = NULL;
         xfs_buf_t               *bp;
         int                     error;
-       xfs_daddr_t             umount_data_blk;
-       xfs_daddr_t             after_umount_blk;
         xfs_daddr_t             rhead_blk;
         xfs_lsn_t               tail_lsn;
-       int                     hblks;
         bool                    wrapped = false;
+       bool                    clean = false;
  
         /*
          * Find previous log record
          */
         if ((error = xlog_find_head(log, head_blk)))
                 return error;
+       ASSERT(*head_blk < INT_MAX);
  
         bp = xlog_get_bp(log, 1);
         if (!bp)
@@ -1271,99 +1361,74 @@ xlog_find_tail(
         }
  
         /*
-        * Trim the head block back to skip over torn records. We can have
-        * multiple log I/Os in flight at any time, so we assume CRC failures
-        * back through the previous several records are torn writes and skip
-        * them.
+        * Search backwards through the log looking for the log record header
+        * block. This wraps all the way back around to the head so something is
+        * seriously wrong if we can't find it.
          */
-       ASSERT(*head_blk < INT_MAX);
-       error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk,
-                                &rhead, &wrapped);
-       if (error)
-               goto done;
+       error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp,
+                                     &rhead_blk, &rhead, &wrapped);
+       if (error < 0)
+               return error;
+       if (!error) {
+               xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+               return -EIO;
+       }
+       *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
  
         /*
-        * Reset log values according to the state of the log when we
-        * crashed.  In the case where head_blk == 0, we bump curr_cycle
-        * one because the next write starts a new cycle rather than
-        * continuing the cycle of the last good log record.  At this
-        * point we have guaranteed that all partial log records have been
-        * accounted for.  Therefore, we know that the last good log record
-        * written was complete and ended exactly on the end boundary
-        * of the physical log.
+        * Set the log state based on the current head record.
          */
-       log->l_prev_block = rhead_blk;
-       log->l_curr_block = (int)*head_blk;
-       log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
-       if (wrapped)
-               log->l_curr_cycle++;
-       atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
-       atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
-       xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
-                                       BBTOB(log->l_curr_block));
-       xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
-                                       BBTOB(log->l_curr_block));
+       xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
+       tail_lsn = atomic64_read(&log->l_tail_lsn);
  
         /*
-        * Look for unmount record.  If we find it, then we know there
-        * was a clean unmount.  Since 'i' could be the last block in
-        * the physical log, we convert to a log block before comparing
-        * to the head_blk.
+        * Look for an unmount record at the head of the log. This sets the log
+        * state to determine whether recovery is necessary.
+        */
+       error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
+                                      rhead_blk, bp, &clean);
+       if (error)
+               goto done;
+
+       /*
+        * Verify the log head if the log is not clean (e.g., we have anything
+        * but an unmount record at the head). This uses CRC verification to
+        * detect and trim torn writes. If discovered, CRC failures are
+        * considered torn writes and the log head is trimmed accordingly.
          *
-        * Save the current tail lsn to use to pass to
-        * xlog_clear_stale_blocks() below.  We won't want to clear the
-        * unmount record if there is one, so we pass the lsn of the
-        * unmount record rather than the block after it.
+        * Note that we can only run CRC verification when the log is dirty
+        * because there's no guarantee that the log data behind an unmount
+        * record is compatible with the current architecture.
          */
-       if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
-               int     h_size = be32_to_cpu(rhead->h_size);
-               int     h_version = be32_to_cpu(rhead->h_version);
+       if (!clean) {
+               xfs_daddr_t     orig_head = *head_blk;
  
-               if ((h_version & XLOG_VERSION_2) &&
-                   (h_size > XLOG_HEADER_CYCLE_SIZE)) {
-                       hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
-                       if (h_size % XLOG_HEADER_CYCLE_SIZE)
-                               hblks++;
-               } else {
-                       hblks = 1;
-               }
-       } else {
-               hblks = 1;
-       }
-       after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
-       after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
-       tail_lsn = atomic64_read(&log->l_tail_lsn);
-       if (*head_blk == after_umount_blk &&
-           be32_to_cpu(rhead->h_num_logops) == 1) {
-               umount_data_blk = rhead_blk + hblks;
-               umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
-               error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+               error = xlog_verify_head(log, head_blk, tail_blk, bp,
+                                        &rhead_blk, &rhead, &wrapped);
                 if (error)
                         goto done;
  
-               op_head = (xlog_op_header_t *)offset;
-               if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
-                       /*
-                        * Set tail and last sync so that newly written
-                        * log records will point recovery to after the
-                        * current unmount record.
-                        */
-                       xlog_assign_atomic_lsn(&log->l_tail_lsn,
-                                       log->l_curr_cycle, after_umount_blk);
-                       xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
-                                       log->l_curr_cycle, after_umount_blk);
-                       *tail_blk = after_umount_blk;
-
-                       /*
-                        * Note that the unmount was clean. If the unmount
-                        * was not clean, we need to know this to rebuild the
-                        * superblock counters from the perag headers if we
-                        * have a filesystem using non-persistent counters.
-                        */
-                       log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+               /* update in-core state again if the head changed */
+               if (*head_blk != orig_head) {
+                       xlog_set_state(log, *head_blk, rhead, rhead_blk,
+                                      wrapped);
+                       tail_lsn = atomic64_read(&log->l_tail_lsn);
+                       error = xlog_check_unmount_rec(log, head_blk, tail_blk,
+                                                      rhead, rhead_blk, bp,
+                                                      &clean);
+                       if (error)
+                               goto done;
                 }
         }
  
+       /*
+        * Note that the unmount was clean. If the unmount was not clean, we
+        * need to know this to rebuild the superblock counters from the perag
+        * headers if we have a filesystem using non-persistent counters.
+        */
+       if (clean)
+               log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+
         /*
          * Make sure that there are no blocks in front of the head
          * with the same cycle number as the head.  This can happen
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h

index 39e1cb201b8eaa00ffe759ce7147cc8b3647fb5d..35a52a880b2f812b382c905bf44df50290aa4f0c 100644 (file)
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -119,11 +119,6 @@ static __always_inline bool virt_spin_lock(struct qspinlock *lock)
  }
  #endif
  
-/*
- * Initializier
- */
-#define        __ARCH_SPIN_LOCK_UNLOCKED       { ATOMIC_INIT(0) }
-
  /*
   * Remapping spinlock architecture specific functions to the corresponding
   * queued spinlock functions.
diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h

index 85f888e86761e1b71bd0de2a891a160194e256d7..034acd0c4956b59932f650e85612a9397236dc66 100644 (file)
--- a/include/asm-generic/qspinlock_types.h
+++ b/include/asm-generic/qspinlock_types.h
@@ -32,6 +32,11 @@ typedef struct qspinlock {
         atomic_t        val;
  } arch_spinlock_t;
  
+/*
+ * Initializier
+ */
+#define        __ARCH_SPIN_LOCK_UNLOCKED       { ATOMIC_INIT(0) }
+
  /*
   * Bitfields in the atomic value:
   *
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h

index c4bd0e2c173c011e37f6eef7acfb9bc3920fcca6..772c784ba76301dbceca9f3c0991c626a893e076 100644 (file)
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -256,6 +256,7 @@
         .rodata           : AT(ADDR(.rodata) - LOAD_OFFSET) {           \
                 VMLINUX_SYMBOL(__start_rodata) = .;                     \
                 *(.rodata) *(.rodata.*)                                 \
+               *(.data..ro_after_init) /* Read only after init */      \
                 *(__vermagic)           /* Kernel version magic */      \
                 . = ALIGN(8);                                           \
                 VMLINUX_SYMBOL(__start___tracepoints_ptrs) = .;         \
diff --git a/include/linux/atomic.h b/include/linux/atomic.h

index 301de78d65f75c194720cc5168359dbce6eab59f..6c502cb13c95aa2f93466687c16d23b950b231d6 100644 (file)
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -548,6 +548,27 @@ static inline int atomic_dec_if_positive(atomic_t *v)
  }
  #endif
  
+/**
+ * fetch_or - perform *ptr |= mask and return old value of *ptr
+ * @ptr: pointer to value
+ * @mask: mask to OR on the value
+ *
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#ifndef fetch_or
+#define fetch_or(ptr, mask)                                            \
+({     typeof(*(ptr)) __old, __val = *(ptr);                           \
+       for (;;) {                                                      \
+               __old = cmpxchg((ptr), __val, __val | (mask));          \
+               if (__old == __val)                                     \
+                       break;                                          \
+               __val = __old;                                          \
+       }                                                               \
+       __old;                                                          \
+})
+#endif
+
+
  #ifdef CONFIG_GENERIC_ATOMIC64
  #include <asm-generic/atomic64.h>
  #endif
diff --git a/include/linux/bio.h b/include/linux/bio.h

index cb68888241084e33af2d7e11753e048431cf7837..88bc64f00bb53cb01fe60ca87dfafd7c9c0f0f4a 100644 (file)
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -320,11 +320,6 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
         struct bvec_iter iter = bio->bi_iter;
         int idx;
  
-       if (!bio_flagged(bio, BIO_CLONED)) {
-               *bv = bio->bi_io_vec[bio->bi_vcnt - 1];
-               return;
-       }
-
         if (unlikely(!bio_multiple_segments(bio))) {
                 *bv = bio_iovec(bio);
                 return;
diff --git a/include/linux/cache.h b/include/linux/cache.h

index 17e7e82d2aa758f9888419a9c03aa4059e16b247..1be04f8c563a0c60bdfca72a36c120ec96ef327c 100644 (file)
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -12,10 +12,24 @@
  #define SMP_CACHE_BYTES L1_CACHE_BYTES
  #endif
  
+/*
+ * __read_mostly is used to keep rarely changing variables out of frequently
+ * updated cachelines. If an architecture doesn't support it, ignore the
+ * hint.
+ */
  #ifndef __read_mostly
  #define __read_mostly
  #endif
  
+/*
+ * __ro_after_init is used to mark things that are read-only after init (i.e.
+ * after mark_rodata_ro() has been called). These are effectively read-only,
+ * but may get written to during init, so can't live in .rodata (via "const").
+ */
+#ifndef __ro_after_init
+#define __ro_after_init __attribute__((__section__(".data..ro_after_init")))
+#endif
+
  #ifndef ____cacheline_aligned
  #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
  #endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h

index 48f5aab117ae12625d041cd18555031e87c178fc..a27f4f17c382b1cbf8884ac33b8134dafe624b2d 100644 (file)
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -263,8 +263,9 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
   * In contrast to ACCESS_ONCE these two macros will also work on aggregate
   * data types like structs or unions. If the size of the accessed data
   * type exceeds the word size of the machine (e.g., 32 bits or 64 bits)
- * READ_ONCE() and WRITE_ONCE()  will fall back to memcpy and print a
- * compile-time warning.
+ * READ_ONCE() and WRITE_ONCE() will fall back to memcpy(). There's at
+ * least two memcpy()s: one for the __builtin_memcpy() and then one for
+ * the macro doing the copy of variable - '__u' allocated on the stack.
   *
   * Their two major use cases are: (1) Mediating communication between
   * process-level code and irq/NMI handlers, all running on the same CPU,
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h

index 75857cda38e989e5a44150c5abb9bd1bd4872957..5e45cf930a3f53fc7071f42a5759fe3dd5fd6a31 100644 (file)
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -386,7 +386,7 @@ static inline void dma_free_attrs(struct device *dev, size_t size,
         if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
                 return;
  
-       if (!ops->free)
+       if (!ops->free || !cpu_addr)
                 return;
  
         debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
@@ -641,31 +641,40 @@ static inline void dmam_release_declared_memory(struct device *dev)
  }
  #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */
  
-static inline void *dma_alloc_writecombine(struct device *dev, size_t size,
-                                          dma_addr_t *dma_addr, gfp_t gfp)
+static inline void *dma_alloc_wc(struct device *dev, size_t size,
+                                dma_addr_t *dma_addr, gfp_t gfp)
  {
         DEFINE_DMA_ATTRS(attrs);
         dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs);
         return dma_alloc_attrs(dev, size, dma_addr, gfp, &attrs);
  }
+#ifndef dma_alloc_writecombine
+#define dma_alloc_writecombine dma_alloc_wc
+#endif
  
-static inline void dma_free_writecombine(struct device *dev, size_t size,
-                                        void *cpu_addr, dma_addr_t dma_addr)
+static inline void dma_free_wc(struct device *dev, size_t size,
+                              void *cpu_addr, dma_addr_t dma_addr)
  {
         DEFINE_DMA_ATTRS(attrs);
         dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs);
         return dma_free_attrs(dev, size, cpu_addr, dma_addr, &attrs);
  }
+#ifndef dma_free_writecombine
+#define dma_free_writecombine dma_free_wc
+#endif
  
-static inline int dma_mmap_writecombine(struct device *dev,
-                                       struct vm_area_struct *vma,
-                                       void *cpu_addr, dma_addr_t dma_addr,
-                                       size_t size)
+static inline int dma_mmap_wc(struct device *dev,
+                             struct vm_area_struct *vma,
+                             void *cpu_addr, dma_addr_t dma_addr,
+                             size_t size)
  {
         DEFINE_DMA_ATTRS(attrs);
         dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs);
         return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, &attrs);
  }
+#ifndef dma_mmap_writecombine
+#define dma_mmap_writecombine dma_mmap_wc
+#endif
  
  #ifdef CONFIG_NEED_DMA_MAP_STATE
  #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME)        dma_addr_t ADDR_NAME
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h

index c2b340e23f62d3240abc6360ab0412a82a12f21a..6d9df3f7e334c3a942728237dca471f7bb602f98 100644 (file)
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -713,6 +713,18 @@ static inline void __ftrace_enabled_restore(int enabled)
  #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
  #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))
  
+static inline unsigned long get_lock_parent_ip(void)
+{
+       unsigned long addr = CALLER_ADDR0;
+
+       if (!in_lock_functions(addr))
+               return addr;
+       addr = CALLER_ADDR1;
+       if (!in_lock_functions(addr))
+               return addr;
+       return CALLER_ADDR2;
+}
+
  #ifdef CONFIG_IRQSOFF_TRACER
    extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
    extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
diff --git a/include/linux/init.h b/include/linux/init.h

index b449f378f995ae647077f521d9f7af3af9480a70..aedb254abc37204a091b790eedfde857dc22155f 100644 (file)
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -142,6 +142,10 @@ void prepare_namespace(void);
  void __init load_default_modules(void);
  int __init init_rootfs(void);
  
+#ifdef CONFIG_DEBUG_RODATA
+void mark_rodata_ro(void);
+#endif
+
  extern void (*late_time_init)(void);
  
  extern bool initcall_debug;
diff --git a/include/linux/ioport.h b/include/linux/ioport.h

index 24bea087e7af8083b3c81596a289d70530c64285..afb45597fb5f0d3aa19f6bb9fd0818995ae6623e 100644 (file)
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -20,6 +20,7 @@ struct resource {
         resource_size_t end;
         const char *name;
         unsigned long flags;
+       unsigned long desc;
         struct resource *parent, *sibling, *child;
  };
  
@@ -49,12 +50,19 @@ struct resource {
  #define IORESOURCE_WINDOW      0x00200000      /* forwarded by bridge */
  #define IORESOURCE_MUXED       0x00400000      /* Resource is software muxed */
  
+#define IORESOURCE_EXT_TYPE_BITS 0x01000000    /* Resource extended types */
+#define IORESOURCE_SYSRAM      0x01000000      /* System RAM (modifier) */
+
  #define IORESOURCE_EXCLUSIVE   0x08000000      /* Userland may not map this resource */
+
  #define IORESOURCE_DISABLED    0x10000000
  #define IORESOURCE_UNSET       0x20000000      /* No address assigned yet */
  #define IORESOURCE_AUTO                0x40000000
  #define IORESOURCE_BUSY                0x80000000      /* Driver has marked this resource busy */
  
+/* I/O resource extended types */
+#define IORESOURCE_SYSTEM_RAM          (IORESOURCE_MEM|IORESOURCE_SYSRAM)
+
  /* PnP IRQ specific bits (IORESOURCE_BITS) */
  #define IORESOURCE_IRQ_HIGHEDGE                (1<<0)
  #define IORESOURCE_IRQ_LOWEDGE         (1<<1)
@@ -105,6 +113,22 @@ struct resource {
  /* PCI control bits.  Shares IORESOURCE_BITS with above PCI ROM.  */
  #define IORESOURCE_PCI_FIXED           (1<<4)  /* Do not move resource */
  
+/*
+ * I/O Resource Descriptors
+ *
+ * Descriptors are used by walk_iomem_res_desc() and region_intersects()
+ * for searching a specific resource range in the iomem table.  Assign
+ * a new descriptor when a resource range supports the search interfaces.
+ * Otherwise, resource.desc must be set to IORES_DESC_NONE (0).
+ */
+enum {
+       IORES_DESC_NONE                         = 0,
+       IORES_DESC_CRASH_KERNEL                 = 1,
+       IORES_DESC_ACPI_TABLES                  = 2,
+       IORES_DESC_ACPI_NV_STORAGE              = 3,
+       IORES_DESC_PERSISTENT_MEMORY            = 4,
+       IORES_DESC_PERSISTENT_MEMORY_LEGACY     = 5,
+};
  
  /* helpers to define resources */
  #define DEFINE_RES_NAMED(_start, _size, _name, _flags)                 \
@@ -113,6 +137,7 @@ struct resource {
                 .end = (_start) + (_size) - 1,                          \
                 .name = (_name),                                        \
                 .flags = (_flags),                                      \
+               .desc = IORES_DESC_NONE,                                \
         }
  
  #define DEFINE_RES_IO_NAMED(_start, _size, _name)                      \
@@ -170,6 +195,10 @@ static inline unsigned long resource_type(const struct resource *res)
  {
         return res->flags & IORESOURCE_TYPE_BITS;
  }
+static inline unsigned long resource_ext_type(const struct resource *res)
+{
+       return res->flags & IORESOURCE_EXT_TYPE_BITS;
+}
  /* True iff r1 completely contains r2 */
  static inline bool resource_contains(struct resource *r1, struct resource *r2)
  {
@@ -239,8 +268,8 @@ extern int
  walk_system_ram_res(u64 start, u64 end, void *arg,
                     int (*func)(u64, u64, void *));
  extern int
-walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, void *arg,
-              int (*func)(u64, u64, void *));
+walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end,
+                   void *arg, int (*func)(u64, u64, void *));
  
  /* True if any part of r1 overlaps r2 */
  static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
diff --git a/include/linux/kasan.h b/include/linux/kasan.h

index 4b9f85c963d0738e20ee7c89e2dcb28ba3cc1aaa..0fdc798e3ff795a9dffd75e36e90ce2b61141562 100644 (file)
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -1,6 +1,7 @@
  #ifndef _LINUX_KASAN_H
  #define _LINUX_KASAN_H
  
+#include <linux/sched.h>
  #include <linux/types.h>
  
  struct kmem_cache;
@@ -13,7 +14,6 @@ struct vm_struct;
  
  #include <asm/kasan.h>
  #include <asm/pgtable.h>
-#include <linux/sched.h>
  
  extern unsigned char kasan_zero_page[PAGE_SIZE];
  extern pte_t kasan_zero_pte[PTRS_PER_PTE];
@@ -43,6 +43,8 @@ static inline void kasan_disable_current(void)
  
  void kasan_unpoison_shadow(const void *address, size_t size);
  
+void kasan_unpoison_task_stack(struct task_struct *task);
+
  void kasan_alloc_pages(struct page *page, unsigned int order);
  void kasan_free_pages(struct page *page, unsigned int order);
  
@@ -66,6 +68,8 @@ void kasan_free_shadow(const struct vm_struct *vm);
  
  static inline void kasan_unpoison_shadow(const void *address, size_t size) {}
  
+static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
+
  static inline void kasan_enable_current(void) {}
  static inline void kasan_disable_current(void) {}
  
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h

index 861f690aa79118a0203d90f896bf860e3374820e..5276fe0916fcc63a8944af0a288b1afeccc100b0 100644 (file)
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -25,6 +25,7 @@
  #include <linux/irqflags.h>
  #include <linux/context_tracking.h>
  #include <linux/irqbypass.h>
+#include <linux/swait.h>
  #include <asm/signal.h>
  
  #include <linux/kvm.h>
@@ -218,7 +219,7 @@ struct kvm_vcpu {
         int fpu_active;
         int guest_fpu_loaded, guest_xcr0_loaded;
         unsigned char fpu_counter;
-       wait_queue_head_t wq;
+       struct swait_queue_head wq;
         struct pid *pid;
         int sigset_active;
         sigset_t sigset;
@@ -782,7 +783,7 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
  }
  #endif
  
-static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
+static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
  {
  #ifdef __KVM_HAVE_ARCH_WQP
         return vcpu->arch.wqp;
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h

index e23121f9d82a042a9b10907fe40e468e5711f99f..59ccab297ae061ba03494ccede8eae2d7a5236a1 100644 (file)
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -37,6 +37,9 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
  
  void clear_all_latency_tracing(struct task_struct *p);
  
+extern int sysctl_latencytop(struct ctl_table *table, int write,
+                       void __user *buffer, size_t *lenp, loff_t *ppos);
+
  #else
  
  static inline void
diff --git a/include/linux/list.h b/include/linux/list.h

index 30cf4200ab40ee40fdae694291e36fe869b508bc..5356f4d661a721ba0446b1183e2a834f3bf3b56f 100644 (file)
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -113,17 +113,6 @@ extern void __list_del_entry(struct list_head *entry);
  extern void list_del(struct list_head *entry);
  #endif
  
-#ifdef CONFIG_DEBUG_LIST
-/*
- * See devm_memremap_pages() which wants DEBUG_LIST=y to assert if one
- * of the pages it allocates is ever passed to list_add()
- */
-extern void list_force_poison(struct list_head *entry);
-#else
-/* fallback to the less strict LIST_POISON* definitions */
-#define list_force_poison list_del
-#endif
-
  /**
   * list_replace - replace old entry by new one
   * @old : the element to be replaced
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h

index 4dca42fd32f52d17326e436c4d9fcbd86a0e8d24..d026b190c53066d25753ce98f0d7c66d864a6c0a 100644 (file)
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -261,7 +261,6 @@ struct held_lock {
  /*
   * Initialization, self-test and debugging-output methods:
   */
-extern void lockdep_init(void);
  extern void lockdep_info(void);
  extern void lockdep_reset(void);
  extern void lockdep_reset_lock(struct lockdep_map *lock);
@@ -392,7 +391,6 @@ static inline void lockdep_on(void)
  # define lockdep_set_current_reclaim_state(g)  do { } while (0)
  # define lockdep_clear_current_reclaim_state() do { } while (0)
  # define lockdep_trace_alloc(g)                        do { } while (0)
-# define lockdep_init()                                do { } while (0)
  # define lockdep_info()                                do { } while (0)
  # define lockdep_init_map(lock, name, key, sub) \
                 do { (void)(name); (void)(key); } while (0)
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 516e149443397dcf712e4e62aca6fb3c78e542b6..3579d1e2fe3acd30bcf7733c356a8b018758391d 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -387,7 +387,8 @@ enum {
         REGION_MIXED,
  };
  
-int region_intersects(resource_size_t offset, size_t size, const char *type);
+int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
+                     unsigned long desc);
  
  /* Support for virtually mapped pages */
  struct page *vmalloc_to_page(const void *addr);
@@ -2138,6 +2139,8 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
  int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
  int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                         unsigned long pfn);
+int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+                       unsigned long pfn, pgprot_t pgprot);
  int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                         pfn_t pfn);
  int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index 624b78b848b89fae506faa1f4f48ce86297ceace..944b2b37313b49bffd581963a5f13380a7c6e520 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -566,10 +566,26 @@ static inline void clear_tlb_flush_pending(struct mm_struct *mm)
  }
  #endif
  
-struct vm_special_mapping
-{
-       const char *name;
+struct vm_fault;
+
+struct vm_special_mapping {
+       const char *name;       /* The name, e.g. "[vdso]". */
+
+       /*
+        * If .fault is not provided, this points to a
+        * NULL-terminated array of pages that back the special mapping.
+        *
+        * This must not be NULL unless .fault is provided.
+        */
         struct page **pages;
+
+       /*
+        * If non-NULL, then this is called to resolve page faults
+        * on the special mapping.  If used, .pages is not checked.
+        */
+       int (*fault)(const struct vm_special_mapping *sm,
+                    struct vm_area_struct *vma,
+                    struct vm_fault *vmf);
  };
  
  enum tlb_flush_reason {
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h

index f5c5a3fa2c8101cc37ea917d29dec30713ec0699..79ec7bbf01557e523aee3ed263425b6fc3360741 100644 (file)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -468,6 +468,7 @@ struct perf_event {
         int                             group_flags;
         struct perf_event               *group_leader;
         struct pmu                      *pmu;
+       void                            *pmu_private;
  
         enum perf_event_active_state    state;
         unsigned int                    attach_state;
@@ -1109,12 +1110,6 @@ static inline void perf_event_task_tick(void)                            { }
  static inline int perf_event_release_kernel(struct perf_event *event)  { return 0; }
  #endif
  
-#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL)
-extern bool perf_event_can_stop_tick(void);
-#else
-static inline bool perf_event_can_stop_tick(void)                      { return true; }
-#endif
-
  #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
  extern void perf_restore_debug_store(void);
  #else
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h

index 907f3fd191acaa1573445a0988e5161b8e408ec3..62d44c176071833ab644d46a95ec910fb1ee57e7 100644 (file)
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -128,9 +128,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer);
  void run_posix_cpu_timers(struct task_struct *task);
  void posix_cpu_timers_exit(struct task_struct *task);
  void posix_cpu_timers_exit_group(struct task_struct *task);
-
-bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk);
-
  void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
                            cputime_t *newval, cputime_t *oldval);
  
diff --git a/include/linux/sched.h b/include/linux/sched.h

index a10494a94cc30f24d65ab866771833883c6f886d..c617ea12c6b7c365f529b517859b26f8c2678e48 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -182,8 +182,6 @@ extern void update_cpu_load_nohz(int active);
  static inline void update_cpu_load_nohz(int active) { }
  #endif
  
-extern unsigned long get_parent_ip(unsigned long addr);
-
  extern void dump_cpu_task(int cpu);
  
  struct seq_file;
@@ -719,6 +717,10 @@ struct signal_struct {
         /* Earliest-expiration cache. */
         struct task_cputime cputime_expires;
  
+#ifdef CONFIG_NO_HZ_FULL
+       unsigned long tick_dep_mask;
+#endif
+
         struct list_head cpu_timers[3];
  
         struct pid *tty_old_pgrp;
@@ -920,6 +922,10 @@ static inline int sched_info_on(void)
  #endif
  }
  
+#ifdef CONFIG_SCHEDSTATS
+void force_schedstat_enabled(void);
+#endif
+
  enum cpu_idle_type {
         CPU_IDLE,
         CPU_NOT_IDLE,
@@ -1289,6 +1295,8 @@ struct sched_rt_entity {
         unsigned long timeout;
         unsigned long watchdog_stamp;
         unsigned int time_slice;
+       unsigned short on_rq;
+       unsigned short on_list;
  
         struct sched_rt_entity *back;
  #ifdef CONFIG_RT_GROUP_SCHED
@@ -1329,10 +1337,6 @@ struct sched_dl_entity {
          * task has to wait for a replenishment to be performed at the
          * next firing of dl_timer.
          *
-        * @dl_new tells if a new instance arrived. If so we must
-        * start executing it with full runtime and reset its absolute
-        * deadline;
-        *
          * @dl_boosted tells if we are boosted due to DI. If so we are
          * outside bandwidth enforcement mechanism (but only until we
          * exit the critical section);
@@ -1340,7 +1344,7 @@ struct sched_dl_entity {
          * @dl_yielded tells if task gave up the cpu before consuming
          * all its available runtime during the last job.
          */
-       int dl_throttled, dl_new, dl_boosted, dl_yielded;
+       int dl_throttled, dl_boosted, dl_yielded;
  
         /*
          * Bandwidth enforcement timer. Each -deadline task has its
@@ -1542,6 +1546,10 @@ struct task_struct {
                 VTIME_SYS,
         } vtime_snap_whence;
  #endif
+
+#ifdef CONFIG_NO_HZ_FULL
+       unsigned long tick_dep_mask;
+#endif
         unsigned long nvcsw, nivcsw; /* context switch counts */
         u64 start_time;         /* monotonic time in nsec */
         u64 real_start_time;    /* boot based time in nsec */
@@ -2356,10 +2364,7 @@ static inline void wake_up_nohz_cpu(int cpu) { }
  #endif
  
  #ifdef CONFIG_NO_HZ_FULL
-extern bool sched_can_stop_tick(void);
  extern u64 scheduler_tick_max_deferment(void);
-#else
-static inline bool sched_can_stop_tick(void) { return false; }
  #endif
  
  #ifdef CONFIG_SCHED_AUTOGROUP
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h

index c9e4731cf10b8e97956b160c503e447490991931..4f080ab4f2cd1199f3f5aee15e7b06fdebb61333 100644 (file)
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -95,4 +95,8 @@ extern int sysctl_numa_balancing(struct ctl_table *table, int write,
                                  void __user *buffer, size_t *lenp,
                                  loff_t *ppos);
  
+extern int sysctl_schedstats(struct ctl_table *table, int write,
+                                void __user *buffer, size_t *lenp,
+                                loff_t *ppos);
+
  #endif /* _SCHED_SYSCTL_H */
diff --git a/include/linux/swait.h b/include/linux/swait.h

new file mode 100644 (file)

index 0000000..c1f9c62
--- /dev/null
+++ b/include/linux/swait.h
@@ -0,0 +1,172 @@
+#ifndef _LINUX_SWAIT_H
+#define _LINUX_SWAIT_H
+
+#include <linux/list.h>
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <asm/current.h>
+
+/*
+ * Simple wait queues
+ *
+ * While these are very similar to the other/complex wait queues (wait.h) the
+ * most important difference is that the simple waitqueue allows for
+ * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
+ * times.
+ *
+ * In order to make this so, we had to drop a fair number of features of the
+ * other waitqueue code; notably:
+ *
+ *  - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue;
+ *    all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
+ *    sleeper state.
+ *
+ *  - the exclusive mode; because this requires preserving the list order
+ *    and this is hard.
+ *
+ *  - custom wake functions; because you cannot give any guarantees about
+ *    random code.
+ *
+ * As a side effect of this; the data structures are slimmer.
+ *
+ * One would recommend using this wait queue where possible.
+ */
+
+struct task_struct;
+
+struct swait_queue_head {
+       raw_spinlock_t          lock;
+       struct list_head        task_list;
+};
+
+struct swait_queue {
+       struct task_struct      *task;
+       struct list_head        task_list;
+};
+
+#define __SWAITQUEUE_INITIALIZER(name) {                               \
+       .task           = current,                                      \
+       .task_list      = LIST_HEAD_INIT((name).task_list),             \
+}
+
+#define DECLARE_SWAITQUEUE(name)                                       \
+       struct swait_queue name = __SWAITQUEUE_INITIALIZER(name)
+
+#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) {                         \
+       .lock           = __RAW_SPIN_LOCK_UNLOCKED(name.lock),          \
+       .task_list      = LIST_HEAD_INIT((name).task_list),             \
+}
+
+#define DECLARE_SWAIT_QUEUE_HEAD(name)                                 \
+       struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name)
+
+extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
+                                   struct lock_class_key *key);
+
+#define init_swait_queue_head(q)                               \
+       do {                                                    \
+               static struct lock_class_key __key;             \
+               __init_swait_queue_head((q), #q, &__key);       \
+       } while (0)
+
+#ifdef CONFIG_LOCKDEP
+# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)                 \
+       ({ init_swait_queue_head(&name); name; })
+# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)                        \
+       struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)
+#else
+# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)                        \
+       DECLARE_SWAIT_QUEUE_HEAD(name)
+#endif
+
+static inline int swait_active(struct swait_queue_head *q)
+{
+       return !list_empty(&q->task_list);
+}
+
+extern void swake_up(struct swait_queue_head *q);
+extern void swake_up_all(struct swait_queue_head *q);
+extern void swake_up_locked(struct swait_queue_head *q);
+
+extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
+extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
+
+extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
+
+/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
+#define ___swait_event(wq, condition, state, ret, cmd)                 \
+({                                                                     \
+       struct swait_queue __wait;                                      \
+       long __ret = ret;                                               \
+                                                                       \
+       INIT_LIST_HEAD(&__wait.task_list);                              \
+       for (;;) {                                                      \
+               long __int = prepare_to_swait_event(&wq, &__wait, state);\
+                                                                       \
+               if (condition)                                          \
+                       break;                                          \
+                                                                       \
+               if (___wait_is_interruptible(state) && __int) {         \
+                       __ret = __int;                                  \
+                       break;                                          \
+               }                                                       \
+                                                                       \
+               cmd;                                                    \
+       }                                                               \
+       finish_swait(&wq, &__wait);                                     \
+       __ret;                                                          \
+})
+
+#define __swait_event(wq, condition)                                   \
+       (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0,    \
+                           schedule())
+
+#define swait_event(wq, condition)                                     \
+do {                                                                   \
+       if (condition)                                                  \
+               break;                                                  \
+       __swait_event(wq, condition);                                   \
+} while (0)
+
+#define __swait_event_timeout(wq, condition, timeout)                  \
+       ___swait_event(wq, ___wait_cond_timeout(condition),             \
+                     TASK_UNINTERRUPTIBLE, timeout,                    \
+                     __ret = schedule_timeout(__ret))
+
+#define swait_event_timeout(wq, condition, timeout)                    \
+({                                                                     \
+       long __ret = timeout;                                           \
+       if (!___wait_cond_timeout(condition))                           \
+               __ret = __swait_event_timeout(wq, condition, timeout);  \
+       __ret;                                                          \
+})
+
+#define __swait_event_interruptible(wq, condition)                     \
+       ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0,            \
+                     schedule())
+
+#define swait_event_interruptible(wq, condition)                       \
+({                                                                     \
+       int __ret = 0;                                                  \
+       if (!(condition))                                               \
+               __ret = __swait_event_interruptible(wq, condition);     \
+       __ret;                                                          \
+})
+
+#define __swait_event_interruptible_timeout(wq, condition, timeout)    \
+       ___swait_event(wq, ___wait_cond_timeout(condition),             \
+                     TASK_INTERRUPTIBLE, timeout,                      \
+                     __ret = schedule_timeout(__ret))
+
+#define swait_event_interruptible_timeout(wq, condition, timeout)      \
+({                                                                     \
+       long __ret = timeout;                                           \
+       if (!___wait_cond_timeout(condition))                           \
+               __ret = __swait_event_interruptible_timeout(wq,         \
+                                               condition, timeout);    \
+       __ret;                                                          \
+})
+
+#endif /* _LINUX_SWAIT_H */
diff --git a/include/linux/tick.h b/include/linux/tick.h

index 97fd4e543846b5f4de1dc5e366f250cc26724098..21f73649a4dc501002356a95bd3e68f52a7be717 100644 (file)
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -97,6 +97,19 @@ static inline void tick_broadcast_exit(void)
         tick_broadcast_oneshot_control(TICK_BROADCAST_EXIT);
  }
  
+enum tick_dep_bits {
+       TICK_DEP_BIT_POSIX_TIMER        = 0,
+       TICK_DEP_BIT_PERF_EVENTS        = 1,
+       TICK_DEP_BIT_SCHED              = 2,
+       TICK_DEP_BIT_CLOCK_UNSTABLE     = 3
+};
+
+#define TICK_DEP_MASK_NONE             0
+#define TICK_DEP_MASK_POSIX_TIMER      (1 << TICK_DEP_BIT_POSIX_TIMER)
+#define TICK_DEP_MASK_PERF_EVENTS      (1 << TICK_DEP_BIT_PERF_EVENTS)
+#define TICK_DEP_MASK_SCHED            (1 << TICK_DEP_BIT_SCHED)
+#define TICK_DEP_MASK_CLOCK_UNSTABLE   (1 << TICK_DEP_BIT_CLOCK_UNSTABLE)
+
  #ifdef CONFIG_NO_HZ_COMMON
  extern int tick_nohz_enabled;
  extern int tick_nohz_tick_stopped(void);
@@ -154,9 +167,73 @@ static inline int housekeeping_any_cpu(void)
         return cpumask_any_and(housekeeping_mask, cpu_online_mask);
  }
  
-extern void tick_nohz_full_kick(void);
+extern void tick_nohz_dep_set(enum tick_dep_bits bit);
+extern void tick_nohz_dep_clear(enum tick_dep_bits bit);
+extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit);
+extern void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit);
+extern void tick_nohz_dep_set_task(struct task_struct *tsk,
+                                  enum tick_dep_bits bit);
+extern void tick_nohz_dep_clear_task(struct task_struct *tsk,
+                                    enum tick_dep_bits bit);
+extern void tick_nohz_dep_set_signal(struct signal_struct *signal,
+                                    enum tick_dep_bits bit);
+extern void tick_nohz_dep_clear_signal(struct signal_struct *signal,
+                                      enum tick_dep_bits bit);
+
+/*
+ * The below are tick_nohz_[set,clear]_dep() wrappers that optimize off-cases
+ * on top of static keys.
+ */
+static inline void tick_dep_set(enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_set(bit);
+}
+
+static inline void tick_dep_clear(enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_clear(bit);
+}
+
+static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_cpu(cpu))
+               tick_nohz_dep_set_cpu(cpu, bit);
+}
+
+static inline void tick_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_cpu(cpu))
+               tick_nohz_dep_clear_cpu(cpu, bit);
+}
+
+static inline void tick_dep_set_task(struct task_struct *tsk,
+                                    enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_set_task(tsk, bit);
+}
+static inline void tick_dep_clear_task(struct task_struct *tsk,
+                                      enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_clear_task(tsk, bit);
+}
+static inline void tick_dep_set_signal(struct signal_struct *signal,
+                                      enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_set_signal(signal, bit);
+}
+static inline void tick_dep_clear_signal(struct signal_struct *signal,
+                                        enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_clear_signal(signal, bit);
+}
+
  extern void tick_nohz_full_kick_cpu(int cpu);
-extern void tick_nohz_full_kick_all(void);
  extern void __tick_nohz_task_switch(void);
  #else
  static inline int housekeeping_any_cpu(void)
@@ -166,9 +243,21 @@ static inline int housekeeping_any_cpu(void)
  static inline bool tick_nohz_full_enabled(void) { return false; }
  static inline bool tick_nohz_full_cpu(int cpu) { return false; }
  static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
+
+static inline void tick_dep_set(enum tick_dep_bits bit) { }
+static inline void tick_dep_clear(enum tick_dep_bits bit) { }
+static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit) { }
+static inline void tick_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { }
+static inline void tick_dep_set_task(struct task_struct *tsk,
+                                    enum tick_dep_bits bit) { }
+static inline void tick_dep_clear_task(struct task_struct *tsk,
+                                      enum tick_dep_bits bit) { }
+static inline void tick_dep_set_signal(struct signal_struct *signal,
+                                      enum tick_dep_bits bit) { }
+static inline void tick_dep_clear_signal(struct signal_struct *signal,
+                                        enum tick_dep_bits bit) { }
+
  static inline void tick_nohz_full_kick_cpu(int cpu) { }
-static inline void tick_nohz_full_kick(void) { }
-static inline void tick_nohz_full_kick_all(void) { }
  static inline void __tick_nohz_task_switch(void) { }
  #endif
  
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h

index acfdbf353a0b5bc7cfeb6ae58fa7e6a6acfa16ba..be586c632a0c04da3887ae6a0cf28357df98ef5c 100644 (file)
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -134,9 +134,6 @@ extern void syscall_unregfunc(void);
                 void *it_func;                                          \
                 void *__data;                                           \
                                                                         \
-               if (!cpu_online(raw_smp_processor_id()))                \
-                       return;                                         \
-                                                                       \
                 if (!(cond))                                            \
                         return;                                         \
                 prercu;                                                 \
@@ -343,15 +340,19 @@ extern void syscall_unregfunc(void);
   * "void *__data, proto" as the callback prototype.
   */
  #define DECLARE_TRACE_NOARGS(name)                                     \
-               __DECLARE_TRACE(name, void, , 1, void *__data, __data)
+       __DECLARE_TRACE(name, void, ,                                   \
+                       cpu_online(raw_smp_processor_id()),             \
+                       void *__data, __data)
  
  #define DECLARE_TRACE(name, proto, args)                               \
-               __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), 1,   \
-                               PARAMS(void *__data, proto),            \
-                               PARAMS(__data, args))
+       __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),              \
+                       cpu_online(raw_smp_processor_id()),             \
+                       PARAMS(void *__data, proto),                    \
+                       PARAMS(__data, args))
  
  #define DECLARE_TRACE_CONDITION(name, proto, args, cond)               \
-       __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), PARAMS(cond), \
+       __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),              \
+                       cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \
                         PARAMS(void *__data, proto),                    \
                         PARAMS(__data, args))
  
diff --git a/include/linux/wait.h b/include/linux/wait.h

index ae71a769b89e3d9f706543a16cfd125120a52cb0..27d7a0ab5da3edf217e60d767c10c65d3e3c52a0 100644 (file)
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -338,7 +338,7 @@ do {                                                                        \
                             schedule(); try_to_freeze())
  
  /**
- * wait_event - sleep (or freeze) until a condition gets true
+ * wait_event_freezable - sleep (or freeze) until a condition gets true
   * @wq: the waitqueue to wait on
   * @condition: a C expression for the event to wait for
   *
diff --git a/include/trace/events/asoc.h b/include/trace/events/asoc.h

index 317a1ed2f4acc7745c3e02955c60e8c1dc8eb7c3..9130dd5a184a25e7d6c6cf0b633d7d5c0c4fc0f6 100644 (file)
--- a/include/trace/events/asoc.h
+++ b/include/trace/events/asoc.h
@@ -231,13 +231,13 @@ TRACE_EVENT(snd_soc_jack_report,
         TP_ARGS(jack, mask, val),
  
         TP_STRUCT__entry(
-               __string(       name,           jack->jack->name        )
+               __string(       name,           jack->jack->id          )
                 __field(        int,            mask                    )
                 __field(        int,            val                     )
         ),
  
         TP_fast_assign(
-               __assign_str(name, jack->jack->name);
+               __assign_str(name, jack->jack->id);
                 __entry->mask = mask;
                 __entry->val = val;
         ),
@@ -253,12 +253,12 @@ TRACE_EVENT(snd_soc_jack_notify,
         TP_ARGS(jack, val),
  
         TP_STRUCT__entry(
-               __string(       name,           jack->jack->name        )
+               __string(       name,           jack->jack->id          )
                 __field(        int,            val                     )
         ),
  
         TP_fast_assign(
-               __assign_str(name, jack->jack->name);
+               __assign_str(name, jack->jack->id);
                 __entry->val = val;
         ),
  
diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h

index 073b9ac245ba0315f31a51f5df9f21bcdf2e9115..51440131d3372feb1f09b7139362db66119f0e2a 100644 (file)
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -328,23 +328,49 @@ TRACE_EVENT(itimer_expire,
  );
  
  #ifdef CONFIG_NO_HZ_COMMON
+
+#define TICK_DEP_NAMES                                 \
+               tick_dep_name(NONE)                     \
+               tick_dep_name(POSIX_TIMER)              \
+               tick_dep_name(PERF_EVENTS)              \
+               tick_dep_name(SCHED)                    \
+               tick_dep_name_end(CLOCK_UNSTABLE)
+
+#undef tick_dep_name
+#undef tick_dep_name_end
+
+#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
+#define tick_dep_name_end(sdep)  TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
+
+TICK_DEP_NAMES
+
+#undef tick_dep_name
+#undef tick_dep_name_end
+
+#define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
+#define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep }
+
+#define show_tick_dep_name(val)                                \
+       __print_symbolic(val, TICK_DEP_NAMES)
+
  TRACE_EVENT(tick_stop,
  
-       TP_PROTO(int success, char *error_msg),
+       TP_PROTO(int success, int dependency),
  
-       TP_ARGS(success, error_msg),
+       TP_ARGS(success, dependency),
  
         TP_STRUCT__entry(
                 __field( int ,          success )
-               __string( msg,          error_msg )
+               __field( int ,          dependency )
         ),
  
         TP_fast_assign(
                 __entry->success        = success;
-               __assign_str(msg, error_msg);
+               __entry->dependency     = dependency;
         ),
  
-       TP_printk("success=%s msg=%s",  __entry->success ? "yes" : "no", __get_str(msg))
+       TP_printk("success=%d dependency=%s",  __entry->success, \
+                       show_tick_dep_name(__entry->dependency))
  );
  #endif
  
diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h

index 625b38f65764b34fe9d942339ef61a08af817564..a8e3a8c0d85a9173c7aa248b84e8e4e9fd01739f 100644 (file)
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -120,7 +120,7 @@ struct media_device_info {
  
  #define MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN        MEDIA_ENT_F_OLD_SUBDEV_BASE
  
-#ifndef __KERNEL__
+#if !defined(__KERNEL__) || defined(__NEED_MEDIA_LEGACY_API)
  
  /*
   * Legacy symbols used to avoid userspace compilation breakages
@@ -133,6 +133,10 @@ struct media_device_info {
  #define MEDIA_ENT_TYPE_MASK            0x00ff0000
  #define MEDIA_ENT_SUBTYPE_MASK         0x0000ffff
  
+/* End of the old subdev reserved numberspace */
+#define MEDIA_ENT_T_DEVNODE_UNKNOWN    (MEDIA_ENT_T_DEVNODE | \
+                                        MEDIA_ENT_SUBTYPE_MASK)
+
  #define MEDIA_ENT_T_DEVNODE            MEDIA_ENT_F_OLD_BASE
  #define MEDIA_ENT_T_DEVNODE_V4L                MEDIA_ENT_F_IO_V4L
  #define MEDIA_ENT_T_DEVNODE_FB         (MEDIA_ENT_T_DEVNODE + 2)
diff --git a/init/main.c b/init/main.c

index 58c9e374704bb20cfff41fa92afcf7cafe498c32..7c27de4577eda65a15510af0d7f4d5cae59d57b8 100644 (file)
--- a/init/main.c
+++ b/init/main.c
@@ -93,9 +93,6 @@ static int kernel_init(void *);
  extern void init_IRQ(void);
  extern void fork_init(void);
  extern void radix_tree_init(void);
-#ifndef CONFIG_DEBUG_RODATA
-static inline void mark_rodata_ro(void) { }
-#endif
  
  /*
   * Debug helper: via this flag we know that we are in 'early bootup code'
@@ -499,11 +496,6 @@ asmlinkage __visible void __init start_kernel(void)
         char *command_line;
         char *after_dashes;
  
-       /*
-        * Need to run as early as possible, to initialize the
-        * lockdep hash:
-        */
-       lockdep_init();
         set_task_stack_end_magic(&init_task);
         smp_setup_processor_id();
         debug_objects_early_init();
@@ -929,6 +921,28 @@ static int try_to_run_init_process(const char *init_filename)
  
  static noinline void __init kernel_init_freeable(void);
  
+#ifdef CONFIG_DEBUG_RODATA
+static bool rodata_enabled = true;
+static int __init set_debug_rodata(char *str)
+{
+       return strtobool(str, &rodata_enabled);
+}
+__setup("rodata=", set_debug_rodata);
+
+static void mark_readonly(void)
+{
+       if (rodata_enabled)
+               mark_rodata_ro();
+       else
+               pr_info("Kernel memory protection disabled.\n");
+}
+#else
+static inline void mark_readonly(void)
+{
+       pr_warn("This architecture does not have kernel memory protection.\n");
+}
+#endif
+
  static int __ref kernel_init(void *unused)
  {
         int ret;
@@ -937,7 +951,7 @@ static int __ref kernel_init(void *unused)
         /* need to finish all async __init code before freeing the memory */
         async_synchronize_full();
         free_initmem();
-       mark_rodata_ro();
+       mark_readonly();
         system_state = SYSTEM_RUNNING;
         numa_default_policy();
  
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c

index e1dbf4a2c69e4ca9721c22184cb9f800325b9194..90ff129c88a27c50e33be234be695650e7210494 100644 (file)
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
         } else {
                 kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
                            __func__, bp->bp_addr);
-#ifdef CONFIG_DEBUG_RODATA
                 if (!bp->bp_type) {
                         kdb_printf("Software breakpoints are unavailable.\n"
-                                  "  Change the kernel CONFIG_DEBUG_RODATA=n\n"
+                                  "  Boot the kernel with rodata=off\n"
                                    "  OR use hw breaks: help bph\n");
                 }
-#endif
                 return 1;
         }
         return 0;
diff --git a/kernel/events/core.c b/kernel/events/core.c

index 614614821f00a02928439b068903904d81233808..712570dddacde345c481f86b8631df75b798a33b 100644 (file)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3112,17 +3112,6 @@ done:
         return rotate;
  }
  
-#ifdef CONFIG_NO_HZ_FULL
-bool perf_event_can_stop_tick(void)
-{
-       if (atomic_read(&nr_freq_events) ||
-           __this_cpu_read(perf_throttled_count))
-               return false;
-       else
-               return true;
-}
-#endif
-
  void perf_event_task_tick(void)
  {
         struct list_head *head = this_cpu_ptr(&active_ctx_list);
@@ -3133,6 +3122,7 @@ void perf_event_task_tick(void)
  
         __this_cpu_inc(perf_throttled_seq);
         throttled = __this_cpu_xchg(perf_throttled_count, 0);
+       tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
  
         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
                 perf_adjust_freq_unthr_context(ctx, throttled);
@@ -3564,6 +3554,28 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
  }
  
+#ifdef CONFIG_NO_HZ_FULL
+static DEFINE_SPINLOCK(nr_freq_lock);
+#endif
+
+static void unaccount_freq_event_nohz(void)
+{
+#ifdef CONFIG_NO_HZ_FULL
+       spin_lock(&nr_freq_lock);
+       if (atomic_dec_and_test(&nr_freq_events))
+               tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
+       spin_unlock(&nr_freq_lock);
+#endif
+}
+
+static void unaccount_freq_event(void)
+{
+       if (tick_nohz_full_enabled())
+               unaccount_freq_event_nohz();
+       else
+               atomic_dec(&nr_freq_events);
+}
+
  static void unaccount_event(struct perf_event *event)
  {
         bool dec = false;
@@ -3580,7 +3592,7 @@ static void unaccount_event(struct perf_event *event)
         if (event->attr.task)
                 atomic_dec(&nr_task_events);
         if (event->attr.freq)
-               atomic_dec(&nr_freq_events);
+               unaccount_freq_event();
         if (event->attr.context_switch) {
                 dec = true;
                 atomic_dec(&nr_switch_events);
@@ -6424,9 +6436,9 @@ static int __perf_event_overflow(struct perf_event *event,
                 if (unlikely(throttle
                              && hwc->interrupts >= max_samples_per_tick)) {
                         __this_cpu_inc(perf_throttled_count);
+                       tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
                         hwc->interrupts = MAX_INTERRUPTS;
                         perf_log_throttle(event, 0);
-                       tick_nohz_full_kick();
                         ret = 1;
                 }
         }
@@ -6785,7 +6797,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
         kfree_rcu(hlist, rcu_head);
  }
  
-static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+static void swevent_hlist_put_cpu(int cpu)
  {
         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
  
@@ -6797,15 +6809,15 @@ static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
         mutex_unlock(&swhash->hlist_mutex);
  }
  
-static void swevent_hlist_put(struct perf_event *event)
+static void swevent_hlist_put(void)
  {
         int cpu;
  
         for_each_possible_cpu(cpu)
-               swevent_hlist_put_cpu(event, cpu);
+               swevent_hlist_put_cpu(cpu);
  }
  
-static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+static int swevent_hlist_get_cpu(int cpu)
  {
         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
         int err = 0;
@@ -6828,14 +6840,13 @@ exit:
         return err;
  }
  
-static int swevent_hlist_get(struct perf_event *event)
+static int swevent_hlist_get(void)
  {
-       int err;
-       int cpu, failed_cpu;
+       int err, cpu, failed_cpu;
  
         get_online_cpus();
         for_each_possible_cpu(cpu) {
-               err = swevent_hlist_get_cpu(event, cpu);
+               err = swevent_hlist_get_cpu(cpu);
                 if (err) {
                         failed_cpu = cpu;
                         goto fail;
@@ -6848,7 +6859,7 @@ fail:
         for_each_possible_cpu(cpu) {
                 if (cpu == failed_cpu)
                         break;
-               swevent_hlist_put_cpu(event, cpu);
+               swevent_hlist_put_cpu(cpu);
         }
  
         put_online_cpus();
@@ -6864,7 +6875,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
         WARN_ON(event->parent);
  
         static_key_slow_dec(&perf_swevent_enabled[event_id]);
-       swevent_hlist_put(event);
+       swevent_hlist_put();
  }
  
  static int perf_swevent_init(struct perf_event *event)
@@ -6895,7 +6906,7 @@ static int perf_swevent_init(struct perf_event *event)
         if (!event->parent) {
                 int err;
  
-               err = swevent_hlist_get(event);
+               err = swevent_hlist_get();
                 if (err)
                         return err;
  
@@ -7816,6 +7827,27 @@ static void account_event_cpu(struct perf_event *event, int cpu)
                 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
  }
  
+/* Freq events need the tick to stay alive (see perf_event_task_tick). */
+static void account_freq_event_nohz(void)
+{
+#ifdef CONFIG_NO_HZ_FULL
+       /* Lock so we don't race with concurrent unaccount */
+       spin_lock(&nr_freq_lock);
+       if (atomic_inc_return(&nr_freq_events) == 1)
+               tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
+       spin_unlock(&nr_freq_lock);
+#endif
+}
+
+static void account_freq_event(void)
+{
+       if (tick_nohz_full_enabled())
+               account_freq_event_nohz();
+       else
+               atomic_inc(&nr_freq_events);
+}
+
+
  static void account_event(struct perf_event *event)
  {
         bool inc = false;
@@ -7831,10 +7863,8 @@ static void account_event(struct perf_event *event)
                 atomic_inc(&nr_comm_events);
         if (event->attr.task)
                 atomic_inc(&nr_task_events);
-       if (event->attr.freq) {
-               if (atomic_inc_return(&nr_freq_events) == 1)
-                       tick_nohz_full_kick_all();
-       }
+       if (event->attr.freq)
+               account_freq_event();
         if (event->attr.context_switch) {
                 atomic_inc(&nr_switch_events);
                 inc = true;
@@ -8001,6 +8031,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
                 }
         }
  
+       /* symmetric to unaccount_event() in _free_event() */
+       account_event(event);
+
         return event;
  
  err_per_task:
@@ -8364,8 +8397,6 @@ SYSCALL_DEFINE5(perf_event_open,
                 }
         }
  
-       account_event(event);
-
         /*
          * Special case software events and allow them to be part of
          * any hardware group.
@@ -8662,8 +8693,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
         /* Mark owner so we could distinguish it from user events. */
         event->owner = TASK_TOMBSTONE;
  
-       account_event(event);
-
         ctx = find_get_context(event->pmu, task, event);
         if (IS_ERR(ctx)) {
                 err = PTR_ERR(ctx);
@@ -9447,6 +9476,7 @@ ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
  
         return 0;
  }
+EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
  
  static int __init perf_event_sysfs_init(void)
  {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c

index 0167679182c08dae79cf54e0b3830fc626866ee2..5f6ce931f1eac3aace94813d0c13cca724ae53f9 100644 (file)
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1178,6 +1178,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
                 goto free_area;
  
         area->xol_mapping.name = "[uprobes]";
+       area->xol_mapping.fault = NULL;
         area->xol_mapping.pages = area->pages;
         area->pages[0] = alloc_page(GFP_HIGHUSER);
         if (!area->pages[0])
diff --git a/kernel/futex.c b/kernel/futex.c

index 5d6ce6413ef1d227b32c99a4330bee1289f9f571..a5d2e74c89e0b217df98326e5febf3caf687687c 100644 (file)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -124,16 +124,16 @@
   *   futex_wait(futex, val);
   *
   *   waiters++; (a)
- *   mb(); (A) <-- paired with -.
- *                              |
- *   lock(hash_bucket(futex));  |
- *                              |
- *   uval = *futex;             |
- *                              |        *futex = newval;
- *                              |        sys_futex(WAKE, futex);
- *                              |          futex_wake(futex);
- *                              |
- *                              `------->  mb(); (B)
+ *   smp_mb(); (A) <-- paired with -.
+ *                                  |
+ *   lock(hash_bucket(futex));      |
+ *                                  |
+ *   uval = *futex;                 |
+ *                                  |        *futex = newval;
+ *                                  |        sys_futex(WAKE, futex);
+ *                                  |          futex_wake(futex);
+ *                                  |
+ *                                  `--------> smp_mb(); (B)
   *   if (uval == val)
   *     queue();
   *     unlock(hash_bucket(futex));
@@ -334,7 +334,7 @@ static inline void futex_get_mm(union futex_key *key)
         /*
          * Ensure futex_get_mm() implies a full barrier such that
          * get_futex_key() implies a full barrier. This is relied upon
-        * as full barrier (B), see the ordering comment above.
+        * as smp_mb(); (B), see the ordering comment above.
          */
         smp_mb__after_atomic();
  }
@@ -407,10 +407,10 @@ static void get_futex_key_refs(union futex_key *key)
  
         switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
         case FUT_OFF_INODE:
-               ihold(key->shared.inode); /* implies MB (B) */
+               ihold(key->shared.inode); /* implies smp_mb(); (B) */
                 break;
         case FUT_OFF_MMSHARED:
-               futex_get_mm(key); /* implies MB (B) */
+               futex_get_mm(key); /* implies smp_mb(); (B) */
                 break;
         default:
                 /*
@@ -418,7 +418,7 @@ static void get_futex_key_refs(union futex_key *key)
                  * mm, therefore the only purpose of calling get_futex_key_refs
                  * is because we need the barrier for the lockless waiter check.
                  */
-               smp_mb(); /* explicit MB (B) */
+               smp_mb(); /* explicit smp_mb(); (B) */
         }
  }
  
@@ -497,7 +497,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
         if (!fshared) {
                 key->private.mm = mm;
                 key->private.address = address;
-               get_futex_key_refs(key);  /* implies MB (B) */
+               get_futex_key_refs(key);  /* implies smp_mb(); (B) */
                 return 0;
         }
  
@@ -520,7 +520,20 @@ again:
         else
                 err = 0;
  
-       lock_page(page);
+       /*
+        * The treatment of mapping from this point on is critical. The page
+        * lock protects many things but in this context the page lock
+        * stabilizes mapping, prevents inode freeing in the shared
+        * file-backed region case and guards against movement to swap cache.
+        *
+        * Strictly speaking the page lock is not needed in all cases being
+        * considered here and page lock forces unnecessarily serialization
+        * From this point on, mapping will be re-verified if necessary and
+        * page lock will be acquired only if it is unavoidable
+        */
+       page = compound_head(page);
+       mapping = READ_ONCE(page->mapping);
+
         /*
          * If page->mapping is NULL, then it cannot be a PageAnon
          * page; but it might be the ZERO_PAGE or in the gate area or
@@ -536,19 +549,31 @@ again:
          * shmem_writepage move it from filecache to swapcache beneath us:
          * an unlikely race, but we do need to retry for page->mapping.
          */
-       mapping = compound_head(page)->mapping;
-       if (!mapping) {
-               int shmem_swizzled = PageSwapCache(page);
+       if (unlikely(!mapping)) {
+               int shmem_swizzled;
+
+               /*
+                * Page lock is required to identify which special case above
+                * applies. If this is really a shmem page then the page lock
+                * will prevent unexpected transitions.
+                */
+               lock_page(page);
+               shmem_swizzled = PageSwapCache(page) || page->mapping;
                 unlock_page(page);
                 put_page(page);
+
                 if (shmem_swizzled)
                         goto again;
+
                 return -EFAULT;
         }
  
         /*
          * Private mappings are handled in a simple way.
          *
+        * If the futex key is stored on an anonymous page, then the associated
+        * object is the mm which is implicitly pinned by the calling process.
+        *
          * NOTE: When userspace waits on a MAP_SHARED mapping, even if
          * it's a read-only handle, it's expected that futexes attach to
          * the object not the particular process.
@@ -566,16 +591,74 @@ again:
                 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
                 key->private.mm = mm;
                 key->private.address = address;
+
+               get_futex_key_refs(key); /* implies smp_mb(); (B) */
+
         } else {
+               struct inode *inode;
+
+               /*
+                * The associated futex object in this case is the inode and
+                * the page->mapping must be traversed. Ordinarily this should
+                * be stabilised under page lock but it's not strictly
+                * necessary in this case as we just want to pin the inode, not
+                * update the radix tree or anything like that.
+                *
+                * The RCU read lock is taken as the inode is finally freed
+                * under RCU. If the mapping still matches expectations then the
+                * mapping->host can be safely accessed as being a valid inode.
+                */
+               rcu_read_lock();
+
+               if (READ_ONCE(page->mapping) != mapping) {
+                       rcu_read_unlock();
+                       put_page(page);
+
+                       goto again;
+               }
+
+               inode = READ_ONCE(mapping->host);
+               if (!inode) {
+                       rcu_read_unlock();
+                       put_page(page);
+
+                       goto again;
+               }
+
+               /*
+                * Take a reference unless it is about to be freed. Previously
+                * this reference was taken by ihold under the page lock
+                * pinning the inode in place so i_lock was unnecessary. The
+                * only way for this check to fail is if the inode was
+                * truncated in parallel so warn for now if this happens.
+                *
+                * We are not calling into get_futex_key_refs() in file-backed
+                * cases, therefore a successful atomic_inc return below will
+                * guarantee that get_futex_key() will still imply smp_mb(); (B).
+                */
+               if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
+                       rcu_read_unlock();
+                       put_page(page);
+
+                       goto again;
+               }
+
+               /* Should be impossible but lets be paranoid for now */
+               if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
+                       err = -EFAULT;
+                       rcu_read_unlock();
+                       iput(inode);
+
+                       goto out;
+               }
+
                 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-               key->shared.inode = mapping->host;
+               key->shared.inode = inode;
                 key->shared.pgoff = basepage_index(page);
+               rcu_read_unlock();
         }
  
-       get_futex_key_refs(key); /* implies MB (B) */
-
  out:
-       unlock_page(page);
         put_page(page);
         return err;
  }
@@ -1864,7 +1947,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
  
         q->lock_ptr = &hb->lock;
  
-       spin_lock(&hb->lock); /* implies MB (A) */
+       spin_lock(&hb->lock); /* implies smp_mb(); (A) */
         return hb;
  }
  
@@ -1927,8 +2010,12 @@ static int unqueue_me(struct futex_q *q)
  
         /* In the common case we don't take the spinlock, which is nice. */
  retry:
-       lock_ptr = q->lock_ptr;
-       barrier();
+       /*
+        * q->lock_ptr can change between this read and the following spin_lock.
+        * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
+        * optimizing lock_ptr out of the logic below.
+        */
+       lock_ptr = READ_ONCE(q->lock_ptr);
         if (lock_ptr != NULL) {
                 spin_lock(lock_ptr);
                 /*
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c

index 8dc65914486999f43caa9276dde914663919fbc6..8d34308ea449ad31bae472f0403c5f1b25324683 100644 (file)
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -66,13 +66,15 @@ struct resource crashk_res = {
         .name  = "Crash kernel",
         .start = 0,
         .end   = 0,
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+       .desc  = IORES_DESC_CRASH_KERNEL
  };
  struct resource crashk_low_res = {
         .name  = "Crash kernel",
         .start = 0,
         .end   = 0,
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+       .desc  = IORES_DESC_CRASH_KERNEL
  };
  
  int kexec_should_crash(struct task_struct *p)
@@ -959,7 +961,7 @@ int crash_shrink_memory(unsigned long new_size)
  
         ram_res->start = end;
         ram_res->end = crashk_res.end;
-       ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+       ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
         ram_res->name = "System RAM";
  
         crashk_res.end = end - 1;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c

index 007b791f676d5605fb674b6db333d9dd1d0a23b4..56b18eb1f0013c9bc7e041a3dd2a8ecaeb580b95 100644 (file)
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -524,10 +524,10 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
  
         /* Walk the RAM ranges and allocate a suitable range for the buffer */
         if (image->type == KEXEC_TYPE_CRASH)
-               ret = walk_iomem_res("Crash kernel",
-                                    IORESOURCE_MEM | IORESOURCE_BUSY,
-                                    crashk_res.start, crashk_res.end, kbuf,
-                                    locate_mem_hole_callback);
+               ret = walk_iomem_res_desc(crashk_res.desc,
+                               IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
+                               crashk_res.start, crashk_res.end, kbuf,
+                               locate_mem_hole_callback);
         else
                 ret = walk_system_ram_res(0, -1, kbuf,
                                           locate_mem_hole_callback);
diff --git a/kernel/latencytop.c b/kernel/latencytop.c

index a02812743a7e63378a79cc768255f807a7fd469b..b5c30d9f46c5084acddbd34e533f91e9c98e8300 100644 (file)
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -47,12 +47,12 @@
   * of times)
   */
  
-#include <linux/latencytop.h>
  #include <linux/kallsyms.h>
  #include <linux/seq_file.h>
  #include <linux/notifier.h>
  #include <linux/spinlock.h>
  #include <linux/proc_fs.h>
+#include <linux/latencytop.h>
  #include <linux/export.h>
  #include <linux/sched.h>
  #include <linux/list.h>
@@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void)
         proc_create("latency_stats", 0644, NULL, &lstats_fops);
         return 0;
  }
+
+int sysctl_latencytop(struct ctl_table *table, int write,
+                       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       int err;
+
+       err = proc_dointvec(table, write, buffer, lenp, ppos);
+       if (latencytop_enabled)
+               force_schedstat_enabled();
+
+       return err;
+}
  device_initcall(init_lstats_procfs);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c

index 716547fdb8731dcc395d70f2664ec6fb22378db6..f894a2cd9b2aa3ad37adfad0d1c15f46fdf93885 100644 (file)
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -123,8 +123,6 @@ static inline int debug_locks_off_graph_unlock(void)
         return ret;
  }
  
-static int lockdep_initialized;
-
  unsigned long nr_list_entries;
  static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
  
@@ -433,19 +431,6 @@ unsigned int nr_process_chains;
  unsigned int max_lockdep_depth;
  
  #ifdef CONFIG_DEBUG_LOCKDEP
-/*
- * We cannot printk in early bootup code. Not even early_printk()
- * might work. So we mark any initialization errors and printk
- * about it later on, in lockdep_info().
- */
-static int lockdep_init_error;
-static const char *lock_init_error;
-static unsigned long lockdep_init_trace_data[20];
-static struct stack_trace lockdep_init_trace = {
-       .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
-       .entries = lockdep_init_trace_data,
-};
-
  /*
   * Various lockdep statistics:
   */
@@ -669,20 +654,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
         struct hlist_head *hash_head;
         struct lock_class *class;
  
-#ifdef CONFIG_DEBUG_LOCKDEP
-       /*
-        * If the architecture calls into lockdep before initializing
-        * the hashes then we'll warn about it later. (we cannot printk
-        * right now)
-        */
-       if (unlikely(!lockdep_initialized)) {
-               lockdep_init();
-               lockdep_init_error = 1;
-               lock_init_error = lock->name;
-               save_stack_trace(&lockdep_init_trace);
-       }
-#endif
-
         if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
                 debug_locks_off();
                 printk(KERN_ERR
@@ -2010,6 +1981,53 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
         return lock_classes + chain_hlocks[chain->base + i];
  }
  
+/*
+ * Returns the index of the first held_lock of the current chain
+ */
+static inline int get_first_held_lock(struct task_struct *curr,
+                                       struct held_lock *hlock)
+{
+       int i;
+       struct held_lock *hlock_curr;
+
+       for (i = curr->lockdep_depth - 1; i >= 0; i--) {
+               hlock_curr = curr->held_locks + i;
+               if (hlock_curr->irq_context != hlock->irq_context)
+                       break;
+
+       }
+
+       return ++i;
+}
+
+/*
+ * Checks whether the chain and the current held locks are consistent
+ * in depth and also in content. If they are not it most likely means
+ * that there was a collision during the calculation of the chain_key.
+ * Returns: 0 not passed, 1 passed
+ */
+static int check_no_collision(struct task_struct *curr,
+                       struct held_lock *hlock,
+                       struct lock_chain *chain)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+       int i, j, id;
+
+       i = get_first_held_lock(curr, hlock);
+
+       if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1)))
+               return 0;
+
+       for (j = 0; j < chain->depth - 1; j++, i++) {
+               id = curr->held_locks[i].class_idx - 1;
+
+               if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id))
+                       return 0;
+       }
+#endif
+       return 1;
+}
+
  /*
   * Look up a dependency chain. If the key is not present yet then
   * add it and return 1 - in this case the new dependency chain is
@@ -2023,7 +2041,6 @@ static inline int lookup_chain_cache(struct task_struct *curr,
         struct lock_class *class = hlock_class(hlock);
         struct hlist_head *hash_head = chainhashentry(chain_key);
         struct lock_chain *chain;
-       struct held_lock *hlock_curr;
         int i, j;
  
         /*
@@ -2041,6 +2058,9 @@ static inline int lookup_chain_cache(struct task_struct *curr,
                 if (chain->chain_key == chain_key) {
  cache_hit:
                         debug_atomic_inc(chain_lookup_hits);
+                       if (!check_no_collision(curr, hlock, chain))
+                               return 0;
+
                         if (very_verbose(class))
                                 printk("\nhash chain already cached, key: "
                                         "%016Lx tail class: [%p] %s\n",
@@ -2078,13 +2098,7 @@ cache_hit:
         chain = lock_chains + nr_lock_chains++;
         chain->chain_key = chain_key;
         chain->irq_context = hlock->irq_context;
-       /* Find the first held_lock of current chain */
-       for (i = curr->lockdep_depth - 1; i >= 0; i--) {
-               hlock_curr = curr->held_locks + i;
-               if (hlock_curr->irq_context != hlock->irq_context)
-                       break;
-       }
-       i++;
+       i = get_first_held_lock(curr, hlock);
         chain->depth = curr->lockdep_depth + 1 - i;
         if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
                 chain->base = nr_chain_hlocks;
@@ -2172,7 +2186,7 @@ static void check_chain_key(struct task_struct *curr)
  {
  #ifdef CONFIG_DEBUG_LOCKDEP
         struct held_lock *hlock, *prev_hlock = NULL;
-       unsigned int i, id;
+       unsigned int i;
         u64 chain_key = 0;
  
         for (i = 0; i < curr->lockdep_depth; i++) {
@@ -2189,17 +2203,16 @@ static void check_chain_key(struct task_struct *curr)
                                 (unsigned long long)hlock->prev_chain_key);
                         return;
                 }
-               id = hlock->class_idx - 1;
                 /*
                  * Whoops ran out of static storage again?
                  */
-               if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+               if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS))
                         return;
  
                 if (prev_hlock && (prev_hlock->irq_context !=
                                                         hlock->irq_context))
                         chain_key = 0;
-               chain_key = iterate_chain_key(chain_key, id);
+               chain_key = iterate_chain_key(chain_key, hlock->class_idx);
                 prev_hlock = hlock;
         }
         if (chain_key != curr->curr_chain_key) {
@@ -3077,7 +3090,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
         struct task_struct *curr = current;
         struct lock_class *class = NULL;
         struct held_lock *hlock;
-       unsigned int depth, id;
+       unsigned int depth;
         int chain_head = 0;
         int class_idx;
         u64 chain_key;
@@ -3180,11 +3193,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
          * The 'key ID' is what is the most compact key value to drive
          * the hash, not class->key.
          */
-       id = class - lock_classes;
         /*
          * Whoops, we did it again.. ran straight out of our static allocation.
          */
-       if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+       if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS))
                 return 0;
  
         chain_key = curr->curr_chain_key;
@@ -3202,7 +3214,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
                 chain_key = 0;
                 chain_head = 1;
         }
-       chain_key = iterate_chain_key(chain_key, id);
+       chain_key = iterate_chain_key(chain_key, class_idx);
  
         if (nest_lock && !__lock_is_held(nest_lock))
                 return print_lock_nested_lock_not_held(curr, hlock, ip);
@@ -4013,28 +4025,6 @@ out_restore:
         raw_local_irq_restore(flags);
  }
  
-void lockdep_init(void)
-{
-       int i;
-
-       /*
-        * Some architectures have their own start_kernel()
-        * code which calls lockdep_init(), while we also
-        * call lockdep_init() from the start_kernel() itself,
-        * and we want to initialize the hashes only once:
-        */
-       if (lockdep_initialized)
-               return;
-
-       for (i = 0; i < CLASSHASH_SIZE; i++)
-               INIT_HLIST_HEAD(classhash_table + i);
-
-       for (i = 0; i < CHAINHASH_SIZE; i++)
-               INIT_HLIST_HEAD(chainhash_table + i);
-
-       lockdep_initialized = 1;
-}
-
  void __init lockdep_info(void)
  {
         printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
@@ -4061,14 +4051,6 @@ void __init lockdep_info(void)
  
         printk(" per task-struct memory footprint: %lu bytes\n",
                 sizeof(struct held_lock) * MAX_LOCK_DEPTH);
-
-#ifdef CONFIG_DEBUG_LOCKDEP
-       if (lockdep_init_error) {
-               printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error);
-               printk("Call stack leading to lockdep invocation was:\n");
-               print_stack_trace(&lockdep_init_trace, 0);
-       }
-#endif
  }
  
  static void
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h

index 5b9102a47ea5209dd887b6dfbca16215b2bad922..c835270f0c2f93c24f623e9ea2ba2e94eebb006a 100644 (file)
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
         node->locked = 0;
         node->next   = NULL;
  
-       prev = xchg_acquire(lock, node);
+       /*
+        * We rely on the full barrier with global transitivity implied by the
+        * below xchg() to order the initialization stores above against any
+        * observation of @node. And to provide the ACQUIRE ordering associated
+        * with a LOCK primitive.
+        */
+       prev = xchg(lock, node);
         if (likely(prev == NULL)) {
                 /*
                  * Lock acquired, don't need to set node->locked to 1. Threads
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c

index 0551c219c40e5bb8f8f87bfb0445100fe1b31cc9..e364b424b019ff51de0bde17fe2611444b5c1efb 100644 (file)
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -716,6 +716,7 @@ static inline void
  __mutex_unlock_common_slowpath(struct mutex *lock, int nested)
  {
         unsigned long flags;
+       WAKE_Q(wake_q);
  
         /*
          * As a performance measurement, release the lock before doing other
@@ -743,11 +744,11 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested)
                                            struct mutex_waiter, list);
  
                 debug_mutex_wake_waiter(lock, waiter);
-
-               wake_up_process(waiter->task);
+               wake_q_add(&wake_q, waiter->task);
         }
  
         spin_unlock_mutex(&lock->wait_lock, flags);
+       wake_up_q(&wake_q);
  }
  
  /*
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c

index 393d1874b9e0c81d44ef9d0447ded22a03bccd81..ce2f75e32ae155b30cfe324c56f9bc526771b66a 100644 (file)
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -358,8 +358,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
          * sequentiality; this is because not all clear_pending_set_locked()
          * implementations imply full barriers.
          */
-       while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
-               cpu_relax();
+       smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK));
  
         /*
          * take ownership and clear the pending bit.
@@ -435,7 +434,7 @@ queue:
          *
          * The PV pv_wait_head_or_lock function, if active, will acquire
          * the lock and return a non-zero value. So we have to skip the
-        * smp_load_acquire() call. As the next PV queue head hasn't been
+        * smp_cond_acquire() call. As the next PV queue head hasn't been
          * designated yet, there is no way for the locked value to become
          * _Q_SLOW_VAL. So both the set_locked() and the
          * atomic_cmpxchg_relaxed() calls will be safe.
@@ -466,7 +465,7 @@ locked:
                         break;
                 }
                 /*
-                * The smp_load_acquire() call above has provided the necessary
+                * The smp_cond_acquire() call above has provided the necessary
                  * acquire semantics required for locking. At most two
                  * iterations of this loop may be ran.
                  */
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h

index 87bb235c3448054d63923d0f9549cbc7718a49ca..21ede57f68b320594f874e4ff0a1b60974fc67f0 100644 (file)
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -54,6 +54,11 @@ struct pv_node {
         u8                      state;
  };
  
+/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
  /*
   * By replacing the regular queued_spin_trylock() with the function below,
   * it will be called once when a lock waiter enter the PV slowpath before
@@ -65,9 +70,11 @@ struct pv_node {
  static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
  {
         struct __qspinlock *l = (void *)lock;
+       int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+                  (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
  
-       return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
-               (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
+       qstat_inc(qstat_pv_lock_stealing, ret);
+       return ret;
  }
  
  /*
@@ -137,11 +144,6 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
  }
  #endif /* _Q_PENDING_BITS == 8 */
  
-/*
- * Include queued spinlock statistics code
- */
-#include "qspinlock_stat.h"
-
  /*
   * Lock and MCS node addresses hash table for fast lookup
   *
@@ -398,6 +400,11 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
         if (READ_ONCE(pn->state) == vcpu_hashed)
                 lp = (struct qspinlock **)1;
  
+       /*
+        * Tracking # of slowpath locking operations
+        */
+       qstat_inc(qstat_pv_lock_slowpath, true);
+
         for (;; waitcnt++) {
                 /*
                  * Set correct vCPU state to be used by queue node wait-early
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h

index 640dcecdd1df7a5ac5c8167fbd9e70e71b2e8f21..eb2a2c9bc3fc15d181c9e5981648974dc7aec65c 100644 (file)
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -22,6 +22,7 @@
   *   pv_kick_wake      - # of vCPU kicks used for computing pv_latency_wake
   *   pv_latency_kick   - average latency (ns) of vCPU kick operation
   *   pv_latency_wake   - average latency (ns) from vCPU kick to wakeup
+ *   pv_lock_slowpath  - # of locking operations via the slowpath
   *   pv_lock_stealing  - # of lock stealing operations
   *   pv_spurious_wakeup        - # of spurious wakeups
   *   pv_wait_again     - # of vCPU wait's that happened after a vCPU kick
@@ -45,6 +46,7 @@ enum qlock_stats {
         qstat_pv_kick_wake,
         qstat_pv_latency_kick,
         qstat_pv_latency_wake,
+       qstat_pv_lock_slowpath,
         qstat_pv_lock_stealing,
         qstat_pv_spurious_wakeup,
         qstat_pv_wait_again,
@@ -70,6 +72,7 @@ static const char * const qstat_names[qstat_num + 1] = {
         [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
         [qstat_pv_latency_kick]    = "pv_latency_kick",
         [qstat_pv_latency_wake]    = "pv_latency_wake",
+       [qstat_pv_lock_slowpath]   = "pv_lock_slowpath",
         [qstat_pv_lock_stealing]   = "pv_lock_stealing",
         [qstat_pv_wait_again]      = "pv_wait_again",
         [qstat_pv_wait_early]      = "pv_wait_early",
@@ -279,19 +282,6 @@ static inline void __pv_wait(u8 *ptr, u8 val)
  #define pv_kick(c)     __pv_kick(c)
  #define pv_wait(p, v)  __pv_wait(p, v)
  
-/*
- * PV unfair trylock count tracking function
- */
-static inline int qstat_spin_steal_lock(struct qspinlock *lock)
-{
-       int ret = pv_queued_spin_steal_lock(lock);
-
-       qstat_inc(qstat_pv_lock_stealing, ret);
-       return ret;
-}
-#undef  queued_spin_trylock
-#define queued_spin_trylock(l) qstat_spin_steal_lock(l)
-
  #else /* CONFIG_QUEUED_LOCK_STAT */
  
  static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
diff --git a/kernel/memremap.c b/kernel/memremap.c

index b981a7b023f04c356df5b852f60caeae232fdf45..fb9b88787ebc269fa39a3c08a19c8d0940245bbe 100644 (file)
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -29,10 +29,10 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
  
  static void *try_ram_remap(resource_size_t offset, size_t size)
  {
-       struct page *page = pfn_to_page(offset >> PAGE_SHIFT);
+       unsigned long pfn = PHYS_PFN(offset);
  
         /* In the simple case just return the existing linear address */
-       if (!PageHighMem(page))
+       if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
                 return __va(offset);
         return NULL; /* fallback to ioremap_cache */
  }
@@ -47,7 +47,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
   * being mapped does not have i/o side effects and the __iomem
   * annotation is not applicable.
   *
- * MEMREMAP_WB - matches the default mapping for "System RAM" on
+ * MEMREMAP_WB - matches the default mapping for System RAM on
   * the architecture.  This is usually a read-allocate write-back cache.
   * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM
   * memremap() will bypass establishing a new mapping and instead return
@@ -56,11 +56,12 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
   * MEMREMAP_WT - establish a mapping whereby writes either bypass the
   * cache or are written through to memory and never exist in a
   * cache-dirty state with respect to program visibility.  Attempts to
- * map "System RAM" with this mapping type will fail.
+ * map System RAM with this mapping type will fail.
   */
  void *memremap(resource_size_t offset, size_t size, unsigned long flags)
  {
-       int is_ram = region_intersects(offset, size, "System RAM");
+       int is_ram = region_intersects(offset, size,
+                                      IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
         void *addr = NULL;
  
         if (is_ram == REGION_MIXED) {
@@ -76,7 +77,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
                  * MEMREMAP_WB is special in that it can be satisifed
                  * from the direct map.  Some archs depend on the
                  * capability of memremap() to autodetect cases where
-                * the requested range is potentially in "System RAM"
+                * the requested range is potentially in System RAM.
                  */
                 if (is_ram == REGION_INTERSECTS)
                         addr = try_ram_remap(offset, size);
@@ -88,7 +89,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
          * If we don't have a mapping yet and more request flags are
          * pending then we will be attempting to establish a new virtual
          * address mapping.  Enforce that this mapping is not aliasing
-        * "System RAM"
+        * System RAM.
          */
         if (!addr && is_ram == REGION_INTERSECTS && flags) {
                 WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
@@ -270,13 +271,17 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
  void *devm_memremap_pages(struct device *dev, struct resource *res,
                 struct percpu_ref *ref, struct vmem_altmap *altmap)
  {
-       int is_ram = region_intersects(res->start, resource_size(res),
-                       "System RAM");
         resource_size_t key, align_start, align_size, align_end;
         struct dev_pagemap *pgmap;
         struct page_map *page_map;
+       int error, nid, is_ram;
         unsigned long pfn;
-       int error, nid;
+
+       align_start = res->start & ~(SECTION_SIZE - 1);
+       align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
+               - align_start;
+       is_ram = region_intersects(align_start, align_size,
+               IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
  
         if (is_ram == REGION_MIXED) {
                 WARN_ONCE(1, "%s attempted on mixed region %pr\n",
@@ -314,8 +319,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
  
         mutex_lock(&pgmap_lock);
         error = 0;
-       align_start = res->start & ~(SECTION_SIZE - 1);
-       align_size = ALIGN(resource_size(res), SECTION_SIZE);
         align_end = align_start + align_size - 1;
         for (key = align_start; key <= align_end; key += SECTION_SIZE) {
                 struct dev_pagemap *dup;
@@ -351,8 +354,13 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
         for_each_device_pfn(pfn, page_map) {
                 struct page *page = pfn_to_page(pfn);
  
-               /* ZONE_DEVICE pages must never appear on a slab lru */
-               list_force_poison(&page->lru);
+               /*
+                * ZONE_DEVICE pages union ->lru with a ->pgmap back
+                * pointer.  It is a bug if a ZONE_DEVICE page is ever
+                * freed or placed on a driver-private list.  Seed the
+                * storage with LIST_POISON* values.
+                */
+               list_del(&page->lru);
                 page->pgmap = pgmap;
         }
         devres_add(dev, page_map);
diff --git a/kernel/profile.c b/kernel/profile.c

index 99513e1160e518d322f6d0ce0f346e6da9fcbbf0..51369697466e36ba52af710863376d4847d43e36 100644 (file)
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -59,6 +59,7 @@ int profile_setup(char *str)
  
         if (!strncmp(str, sleepstr, strlen(sleepstr))) {
  #ifdef CONFIG_SCHEDSTATS
+               force_schedstat_enabled();
                 prof_on = SLEEP_PROFILING;
                 if (str[strlen(sleepstr)] == ',')
                         str += strlen(sleepstr) + 1;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c

index e41dd4131f7a141976e771653e3169f2955f6f33..9fd5b628a88dc0cd1e0790d20ecf212940bd5407 100644 (file)
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1614,7 +1614,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
         int needmore;
         struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
  
-       rcu_nocb_gp_cleanup(rsp, rnp);
         rnp->need_future_gp[c & 0x1] = 0;
         needmore = rnp->need_future_gp[(c + 1) & 0x1];
         trace_rcu_future_gp(rnp, rdp, c,
@@ -1635,7 +1634,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
             !READ_ONCE(rsp->gp_flags) ||
             !rsp->gp_kthread)
                 return;
-       wake_up(&rsp->gp_wq);
+       swake_up(&rsp->gp_wq);
  }
  
  /*
@@ -2010,6 +2009,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
         int nocb = 0;
         struct rcu_data *rdp;
         struct rcu_node *rnp = rcu_get_root(rsp);
+       struct swait_queue_head *sq;
  
         WRITE_ONCE(rsp->gp_activity, jiffies);
         raw_spin_lock_irq_rcu_node(rnp);
@@ -2046,7 +2046,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
                         needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
                 /* smp_mb() provided by prior unlock-lock pair. */
                 nocb += rcu_future_gp_cleanup(rsp, rnp);
+               sq = rcu_nocb_gp_get(rnp);
                 raw_spin_unlock_irq(&rnp->lock);
+               rcu_nocb_gp_cleanup(sq);
                 cond_resched_rcu_qs();
                 WRITE_ONCE(rsp->gp_activity, jiffies);
                 rcu_gp_slow(rsp, gp_cleanup_delay);
@@ -2092,7 +2094,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
                                                READ_ONCE(rsp->gpnum),
                                                TPS("reqwait"));
                         rsp->gp_state = RCU_GP_WAIT_GPS;
-                       wait_event_interruptible(rsp->gp_wq,
+                       swait_event_interruptible(rsp->gp_wq,
                                                  READ_ONCE(rsp->gp_flags) &
                                                  RCU_GP_FLAG_INIT);
                         rsp->gp_state = RCU_GP_DONE_GPS;
@@ -2122,7 +2124,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
                                                READ_ONCE(rsp->gpnum),
                                                TPS("fqswait"));
                         rsp->gp_state = RCU_GP_WAIT_FQS;
-                       ret = wait_event_interruptible_timeout(rsp->gp_wq,
+                       ret = swait_event_interruptible_timeout(rsp->gp_wq,
                                         rcu_gp_fqs_check_wake(rsp, &gf), j);
                         rsp->gp_state = RCU_GP_DOING_FQS;
                         /* Locking provides needed memory barriers. */
@@ -2246,7 +2248,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
         WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
         WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
         raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
-       rcu_gp_kthread_wake(rsp);
+       swake_up(&rsp->gp_wq);  /* Memory barrier implied by swake_up() path. */
  }
  
  /*
@@ -2900,7 +2902,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
         }
         WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
         raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
-       rcu_gp_kthread_wake(rsp);
+       swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
  }
  
  /*
@@ -3529,7 +3531,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
                         if (wake) {
                                 smp_mb(); /* EGP done before wake_up(). */
-                               wake_up(&rsp->expedited_wq);
+                               swake_up(&rsp->expedited_wq);
                         }
                         break;
                 }
@@ -3780,7 +3782,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
         jiffies_start = jiffies;
  
         for (;;) {
-               ret = wait_event_interruptible_timeout(
+               ret = swait_event_timeout(
                                 rsp->expedited_wq,
                                 sync_rcu_preempt_exp_done(rnp_root),
                                 jiffies_stall);
@@ -3788,7 +3790,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
                         return;
                 if (ret < 0) {
                         /* Hit a signal, disable CPU stall warnings. */
-                       wait_event(rsp->expedited_wq,
+                       swait_event(rsp->expedited_wq,
                                    sync_rcu_preempt_exp_done(rnp_root));
                         return;
                 }
@@ -4482,8 +4484,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
                 }
         }
  
-       init_waitqueue_head(&rsp->gp_wq);
-       init_waitqueue_head(&rsp->expedited_wq);
+       init_swait_queue_head(&rsp->gp_wq);
+       init_swait_queue_head(&rsp->expedited_wq);
         rnp = rsp->level[rcu_num_lvls - 1];
         for_each_possible_cpu(i) {
                 while (i > rnp->grphi)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h

index 83360b4f4352786b0129d5ac64046102d73933db..bbd235d0e71f7144b3548c75e33db3ddc97f84a6 100644 (file)
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,6 +27,7 @@
  #include <linux/threads.h>
  #include <linux/cpumask.h>
  #include <linux/seqlock.h>
+#include <linux/swait.h>
  #include <linux/stop_machine.h>
  
  /*
@@ -243,7 +244,7 @@ struct rcu_node {
                                 /* Refused to boost: not sure why, though. */
                                 /*  This can happen due to race conditions. */
  #ifdef CONFIG_RCU_NOCB_CPU
-       wait_queue_head_t nocb_gp_wq[2];
+       struct swait_queue_head nocb_gp_wq[2];
                                 /* Place for rcu_nocb_kthread() to wait GP. */
  #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
         int need_future_gp[2];
@@ -399,7 +400,7 @@ struct rcu_data {
         atomic_long_t nocb_q_count_lazy; /*  invocation (all stages). */
         struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
         struct rcu_head **nocb_follower_tail;
-       wait_queue_head_t nocb_wq;      /* For nocb kthreads to sleep on. */
+       struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
         struct task_struct *nocb_kthread;
         int nocb_defer_wakeup;          /* Defer wakeup of nocb_kthread. */
  
@@ -478,7 +479,7 @@ struct rcu_state {
         unsigned long gpnum;                    /* Current gp number. */
         unsigned long completed;                /* # of last completed gp. */
         struct task_struct *gp_kthread;         /* Task for grace periods. */
-       wait_queue_head_t gp_wq;                /* Where GP task waits. */
+       struct swait_queue_head gp_wq;          /* Where GP task waits. */
         short gp_flags;                         /* Commands for GP task. */
         short gp_state;                         /* GP kthread sleep state. */
  
@@ -506,7 +507,7 @@ struct rcu_state {
         unsigned long expedited_sequence;       /* Take a ticket. */
         atomic_long_t expedited_normal;         /* # fallbacks to normal. */
         atomic_t expedited_need_qs;             /* # CPUs left to check in. */
-       wait_queue_head_t expedited_wq;         /* Wait for check-ins. */
+       struct swait_queue_head expedited_wq;   /* Wait for check-ins. */
         int ncpus_snap;                         /* # CPUs seen last time. */
  
         unsigned long jiffies_force_qs;         /* Time at which to invoke */
@@ -621,7 +622,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
  static void increment_cpu_stall_ticks(void);
  static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
  static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
  static void rcu_init_one_nocb(struct rcu_node *rnp);
  static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
                             bool lazy, unsigned long flags);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h

index 9467a8b7e756173bc9a94cc5b9828066a9f50e16..080bd202d360faa62f1a5fc63654239b5cd80144 100644 (file)
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1811,9 +1811,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
   * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
   * grace period.
   */
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
  {
-       wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
+       swake_up_all(sq);
  }
  
  /*
@@ -1829,10 +1829,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
         rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
  }
  
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+       return &rnp->nocb_gp_wq[rnp->completed & 0x1];
+}
+
  static void rcu_init_one_nocb(struct rcu_node *rnp)
  {
-       init_waitqueue_head(&rnp->nocb_gp_wq[0]);
-       init_waitqueue_head(&rnp->nocb_gp_wq[1]);
+       init_swait_queue_head(&rnp->nocb_gp_wq[0]);
+       init_swait_queue_head(&rnp->nocb_gp_wq[1]);
  }
  
  #ifndef CONFIG_RCU_NOCB_CPU_ALL
@@ -1857,7 +1862,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
         if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
                 /* Prior smp_mb__after_atomic() orders against prior enqueue. */
                 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
-               wake_up(&rdp_leader->nocb_wq);
+               swake_up(&rdp_leader->nocb_wq);
         }
  }
  
@@ -2069,7 +2074,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
          */
         trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
         for (;;) {
-               wait_event_interruptible(
+               swait_event_interruptible(
                         rnp->nocb_gp_wq[c & 0x1],
                         (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
                 if (likely(d))
@@ -2097,7 +2102,7 @@ wait_again:
         /* Wait for callbacks to appear. */
         if (!rcu_nocb_poll) {
                 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
-               wait_event_interruptible(my_rdp->nocb_wq,
+               swait_event_interruptible(my_rdp->nocb_wq,
                                 !READ_ONCE(my_rdp->nocb_leader_sleep));
                 /* Memory barrier handled by smp_mb() calls below and repoll. */
         } else if (firsttime) {
@@ -2172,7 +2177,7 @@ wait_again:
                          * List was empty, wake up the follower.
                          * Memory barriers supplied by atomic_long_add().
                          */
-                       wake_up(&rdp->nocb_wq);
+                       swake_up(&rdp->nocb_wq);
                 }
         }
  
@@ -2193,7 +2198,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
                 if (!rcu_nocb_poll) {
                         trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
                                             "FollowerSleep");
-                       wait_event_interruptible(rdp->nocb_wq,
+                       swait_event_interruptible(rdp->nocb_wq,
                                                  READ_ONCE(rdp->nocb_follower_head));
                 } else if (firsttime) {
                         /* Don't drown trace log with "Poll"! */
@@ -2352,7 +2357,7 @@ void __init rcu_init_nohz(void)
  static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
  {
         rdp->nocb_tail = &rdp->nocb_head;
-       init_waitqueue_head(&rdp->nocb_wq);
+       init_swait_queue_head(&rdp->nocb_wq);
         rdp->nocb_follower_tail = &rdp->nocb_follower_head;
  }
  
@@ -2502,7 +2507,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
         return false;
  }
  
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
  {
  }
  
@@ -2510,6 +2515,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
  {
  }
  
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+       return NULL;
+}
+
  static void rcu_init_one_nocb(struct rcu_node *rnp)
  {
  }
diff --git a/kernel/resource.c b/kernel/resource.c

index 3669d1bfc4254213e0e42dacab374624d74cdd5c..4d466052426b3e993e7e3c3550e948308e377c51 100644 (file)
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -333,13 +333,13 @@ int release_resource(struct resource *old)
  EXPORT_SYMBOL(release_resource);
  
  /*
- * Finds the lowest iomem reosurce exists with-in [res->start.res->end)
- * the caller must specify res->start, res->end, res->flags and "name".
- * If found, returns 0, res is overwritten, if not found, returns -1.
- * This walks through whole tree and not just first level children
- * until and unless first_level_children_only is true.
+ * Finds the lowest iomem resource existing within [res->start.res->end).
+ * The caller must specify res->start, res->end, res->flags, and optionally
+ * desc.  If found, returns 0, res is overwritten, if not found, returns -1.
+ * This function walks the whole tree and not just first level children until
+ * and unless first_level_children_only is true.
   */
-static int find_next_iomem_res(struct resource *res, char *name,
+static int find_next_iomem_res(struct resource *res, unsigned long desc,
                                bool first_level_children_only)
  {
         resource_size_t start, end;
@@ -358,9 +358,9 @@ static int find_next_iomem_res(struct resource *res, char *name,
         read_lock(&resource_lock);
  
         for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
-               if (p->flags != res->flags)
+               if ((p->flags & res->flags) != res->flags)
                         continue;
-               if (name && strcmp(p->name, name))
+               if ((desc != IORES_DESC_NONE) && (desc != p->desc))
                         continue;
                 if (p->start > end) {
                         p = NULL;
@@ -385,15 +385,18 @@ static int find_next_iomem_res(struct resource *res, char *name,
   * Walks through iomem resources and calls func() with matching resource
   * ranges. This walks through whole tree and not just first level children.
   * All the memory ranges which overlap start,end and also match flags and
- * name are valid candidates.
+ * desc are valid candidates.
   *
- * @name: name of resource
- * @flags: resource flags
+ * @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check.
+ * @flags: I/O resource flags
   * @start: start addr
   * @end: end addr
+ *
+ * NOTE: For a new descriptor search, define a new IORES_DESC in
+ * <linux/ioport.h> and set it in 'desc' of a target resource entry.
   */
-int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
-               void *arg, int (*func)(u64, u64, void *))
+int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
+               u64 end, void *arg, int (*func)(u64, u64, void *))
  {
         struct resource res;
         u64 orig_end;
@@ -403,23 +406,27 @@ int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
         res.end = end;
         res.flags = flags;
         orig_end = res.end;
+
         while ((res.start < res.end) &&
-               (!find_next_iomem_res(&res, name, false))) {
+               (!find_next_iomem_res(&res, desc, false))) {
+
                 ret = (*func)(res.start, res.end, arg);
                 if (ret)
                         break;
+
                 res.start = res.end + 1;
                 res.end = orig_end;
         }
+
         return ret;
  }
  
  /*
- * This function calls callback against all memory range of "System RAM"
- * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
- * Now, this function is only for "System RAM". This function deals with
- * full ranges and not pfn. If resources are not pfn aligned, dealing
- * with pfn can truncate ranges.
+ * This function calls the @func callback against all memory ranges of type
+ * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
+ * Now, this function is only for System RAM, it deals with full ranges and
+ * not PFNs. If resources are not PFN-aligned, dealing with PFNs can truncate
+ * ranges.
   */
  int walk_system_ram_res(u64 start, u64 end, void *arg,
                                 int (*func)(u64, u64, void *))
@@ -430,10 +437,10 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
  
         res.start = start;
         res.end = end;
-       res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
         orig_end = res.end;
         while ((res.start < res.end) &&
-               (!find_next_iomem_res(&res, "System RAM", true))) {
+               (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) {
                 ret = (*func)(res.start, res.end, arg);
                 if (ret)
                         break;
@@ -446,9 +453,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
  #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
  
  /*
- * This function calls callback against all memory range of "System RAM"
- * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
- * Now, this function is only for "System RAM".
+ * This function calls the @func callback against all memory ranges of type
+ * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
+ * It is to be used only for System RAM.
   */
  int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
                 void *arg, int (*func)(unsigned long, unsigned long, void *))
@@ -460,10 +467,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
  
         res.start = (u64) start_pfn << PAGE_SHIFT;
         res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
-       res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
         orig_end = res.end;
         while ((res.start < res.end) &&
-               (find_next_iomem_res(&res, "System RAM", true) >= 0)) {
+               (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) {
                 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
                 end_pfn = (res.end + 1) >> PAGE_SHIFT;
                 if (end_pfn > pfn)
@@ -484,7 +491,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
  }
  /*
   * This generic page_is_ram() returns true if specified address is
- * registered as "System RAM" in iomem_resource list.
+ * registered as System RAM in iomem_resource list.
   */
  int __weak page_is_ram(unsigned long pfn)
  {
@@ -496,30 +503,34 @@ EXPORT_SYMBOL_GPL(page_is_ram);
   * region_intersects() - determine intersection of region with known resources
   * @start: region start address
   * @size: size of region
- * @name: name of resource (in iomem_resource)
+ * @flags: flags of resource (in iomem_resource)
+ * @desc: descriptor of resource (in iomem_resource) or IORES_DESC_NONE
   *
   * Check if the specified region partially overlaps or fully eclipses a
- * resource identified by @name.  Return REGION_DISJOINT if the region
- * does not overlap @name, return REGION_MIXED if the region overlaps
- * @type and another resource, and return REGION_INTERSECTS if the
- * region overlaps @type and no other defined resource. Note, that
- * REGION_INTERSECTS is also returned in the case when the specified
- * region overlaps RAM and undefined memory holes.
+ * resource identified by @flags and @desc (optional with IORES_DESC_NONE).
+ * Return REGION_DISJOINT if the region does not overlap @flags/@desc,
+ * return REGION_MIXED if the region overlaps @flags/@desc and another
+ * resource, and return REGION_INTERSECTS if the region overlaps @flags/@desc
+ * and no other defined resource. Note that REGION_INTERSECTS is also
+ * returned in the case when the specified region overlaps RAM and undefined
+ * memory holes.
   *
   * region_intersect() is used by memory remapping functions to ensure
   * the user is not remapping RAM and is a vast speed up over walking
   * through the resource table page by page.
   */
-int region_intersects(resource_size_t start, size_t size, const char *name)
+int region_intersects(resource_size_t start, size_t size, unsigned long flags,
+                     unsigned long desc)
  {
-       unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
         resource_size_t end = start + size - 1;
         int type = 0; int other = 0;
         struct resource *p;
  
         read_lock(&resource_lock);
         for (p = iomem_resource.child; p ; p = p->sibling) {
-               bool is_type = strcmp(p->name, name) == 0 && p->flags == flags;
+               bool is_type = (((p->flags & flags) == flags) &&
+                               ((desc == IORES_DESC_NONE) ||
+                                (desc == p->desc)));
  
                 if (start >= p->start && start <= p->end)
                         is_type ? type++ : other++;
@@ -538,6 +549,7 @@ int region_intersects(resource_size_t start, size_t size, const char *name)
  
         return REGION_DISJOINT;
  }
+EXPORT_SYMBOL_GPL(region_intersects);
  
  void __weak arch_remove_reservations(struct resource *avail)
  {
@@ -948,6 +960,7 @@ static void __init __reserve_region_with_split(struct resource *root,
         res->start = start;
         res->end = end;
         res->flags = IORESOURCE_BUSY;
+       res->desc = IORES_DESC_NONE;
  
         while (1) {
  
@@ -982,6 +995,7 @@ static void __init __reserve_region_with_split(struct resource *root,
                                 next_res->start = conflict->end + 1;
                                 next_res->end = end;
                                 next_res->flags = IORESOURCE_BUSY;
+                               next_res->desc = IORES_DESC_NONE;
                         }
                 } else {
                         res->start = conflict->end + 1;
@@ -1071,8 +1085,9 @@ struct resource * __request_region(struct resource *parent,
         res->name = name;
         res->start = start;
         res->end = start + n - 1;
-       res->flags = resource_type(parent);
+       res->flags = resource_type(parent) | resource_ext_type(parent);
         res->flags |= IORESOURCE_BUSY | flags;
+       res->desc = IORES_DESC_NONE;
  
         write_lock(&resource_lock);
  
@@ -1238,6 +1253,7 @@ int release_mem_region_adjustable(struct resource *parent,
                         new_res->start = end + 1;
                         new_res->end = res->end;
                         new_res->flags = res->flags;
+                       new_res->desc = res->desc;
                         new_res->parent = res->parent;
                         new_res->sibling = res->sibling;
                         new_res->child = NULL;
@@ -1413,6 +1429,7 @@ static int __init reserve_setup(char *str)
                         res->start = io_start;
                         res->end = io_start + io_num - 1;
                         res->flags = IORESOURCE_BUSY;
+                       res->desc = IORES_DESC_NONE;
                         res->child = NULL;
                         if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
                                 reserved = x+1;
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile

index 67687973ce80d63d3f52698fb4b738b76964b896..7d4cba227cbd393c3515516926bfeb035b222e0c 100644 (file)
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
  
  obj-y += core.o loadavg.o clock.o cputime.o
  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o idle.o
+obj-y += wait.o swait.o completion.o idle.o
  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
  obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c

index bc54e84675da0d50bd9a60a82f2da5df8590b080..fedb967a98419c14348e8d79becdcb7abf62bfa2 100644 (file)
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -61,6 +61,7 @@
  #include <linux/static_key.h>
  #include <linux/workqueue.h>
  #include <linux/compiler.h>
+#include <linux/tick.h>
  
  /*
   * Scheduler clock - returns current time in nanosec units.
@@ -89,6 +90,8 @@ static void __set_sched_clock_stable(void)
  {
         if (!sched_clock_stable())
                 static_key_slow_inc(&__sched_clock_stable);
+
+       tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
  }
  
  void set_sched_clock_stable(void)
@@ -108,6 +111,8 @@ static void __clear_sched_clock_stable(struct work_struct *work)
         /* XXX worry about clock continuity */
         if (sched_clock_stable())
                 static_key_slow_dec(&__sched_clock_stable);
+
+       tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
  }
  
  static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 9503d590e5ef5b81537947b9925ecfe689f72a91..e5725b931bee953fb8bcf0e0ae74dea38c404537 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,6 +26,7 @@
   *              Thomas Gleixner, Mike Kravetz
   */
  
+#include <linux/kasan.h>
  #include <linux/mm.h>
  #include <linux/module.h>
  #include <linux/nmi.h>
@@ -66,12 +67,10 @@
  #include <linux/pagemap.h>
  #include <linux/hrtimer.h>
  #include <linux/tick.h>
-#include <linux/debugfs.h>
  #include <linux/ctype.h>
  #include <linux/ftrace.h>
  #include <linux/slab.h>
  #include <linux/init_task.h>
-#include <linux/binfmts.h>
  #include <linux/context_tracking.h>
  #include <linux/compiler.h>
  
@@ -124,138 +123,6 @@ const_debug unsigned int sysctl_sched_features =
  
  #undef SCHED_FEAT
  
-#ifdef CONFIG_SCHED_DEBUG
-#define SCHED_FEAT(name, enabled)      \
-       #name ,
-
-static const char * const sched_feat_names[] = {
-#include "features.h"
-};
-
-#undef SCHED_FEAT
-
-static int sched_feat_show(struct seq_file *m, void *v)
-{
-       int i;
-
-       for (i = 0; i < __SCHED_FEAT_NR; i++) {
-               if (!(sysctl_sched_features & (1UL << i)))
-                       seq_puts(m, "NO_");
-               seq_printf(m, "%s ", sched_feat_names[i]);
-       }
-       seq_puts(m, "\n");
-
-       return 0;
-}
-
-#ifdef HAVE_JUMP_LABEL
-
-#define jump_label_key__true  STATIC_KEY_INIT_TRUE
-#define jump_label_key__false STATIC_KEY_INIT_FALSE
-
-#define SCHED_FEAT(name, enabled)      \
-       jump_label_key__##enabled ,
-
-struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
-#include "features.h"
-};
-
-#undef SCHED_FEAT
-
-static void sched_feat_disable(int i)
-{
-       static_key_disable(&sched_feat_keys[i]);
-}
-
-static void sched_feat_enable(int i)
-{
-       static_key_enable(&sched_feat_keys[i]);
-}
-#else
-static void sched_feat_disable(int i) { };
-static void sched_feat_enable(int i) { };
-#endif /* HAVE_JUMP_LABEL */
-
-static int sched_feat_set(char *cmp)
-{
-       int i;
-       int neg = 0;
-
-       if (strncmp(cmp, "NO_", 3) == 0) {
-               neg = 1;
-               cmp += 3;
-       }
-
-       for (i = 0; i < __SCHED_FEAT_NR; i++) {
-               if (strcmp(cmp, sched_feat_names[i]) == 0) {
-                       if (neg) {
-                               sysctl_sched_features &= ~(1UL << i);
-                               sched_feat_disable(i);
-                       } else {
-                               sysctl_sched_features |= (1UL << i);
-                               sched_feat_enable(i);
-                       }
-                       break;
-               }
-       }
-
-       return i;
-}
-
-static ssize_t
-sched_feat_write(struct file *filp, const char __user *ubuf,
-               size_t cnt, loff_t *ppos)
-{
-       char buf[64];
-       char *cmp;
-       int i;
-       struct inode *inode;
-
-       if (cnt > 63)
-               cnt = 63;
-
-       if (copy_from_user(&buf, ubuf, cnt))
-               return -EFAULT;
-
-       buf[cnt] = 0;
-       cmp = strstrip(buf);
-
-       /* Ensure the static_key remains in a consistent state */
-       inode = file_inode(filp);
-       inode_lock(inode);
-       i = sched_feat_set(cmp);
-       inode_unlock(inode);
-       if (i == __SCHED_FEAT_NR)
-               return -EINVAL;
-
-       *ppos += cnt;
-
-       return cnt;
-}
-
-static int sched_feat_open(struct inode *inode, struct file *filp)
-{
-       return single_open(filp, sched_feat_show, NULL);
-}
-
-static const struct file_operations sched_feat_fops = {
-       .open           = sched_feat_open,
-       .write          = sched_feat_write,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-static __init int sched_init_debug(void)
-{
-       debugfs_create_file("sched_features", 0644, NULL, NULL,
-                       &sched_feat_fops);
-
-       return 0;
-}
-late_initcall(sched_init_debug);
-#endif /* CONFIG_SCHED_DEBUG */
-
  /*
   * Number of tasks to iterate in a single balance run.
   * Limited because this is done with IRQs disabled.
@@ -453,20 +320,6 @@ static inline void init_hrtick(void)
  }
  #endif /* CONFIG_SCHED_HRTICK */
  
-/*
- * cmpxchg based fetch_or, macro so it works for different integer types
- */
-#define fetch_or(ptr, val)                                             \
-({     typeof(*(ptr)) __old, __val = *(ptr);                           \
-       for (;;) {                                                      \
-               __old = cmpxchg((ptr), __val, __val | (val));           \
-               if (__old == __val)                                     \
-                       break;                                          \
-               __val = __old;                                          \
-       }                                                               \
-       __old;                                                          \
-})
-
  #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
  /*
   * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
@@ -715,31 +568,36 @@ static inline bool got_nohz_idle_kick(void)
  #endif /* CONFIG_NO_HZ_COMMON */
  
  #ifdef CONFIG_NO_HZ_FULL
-bool sched_can_stop_tick(void)
+bool sched_can_stop_tick(struct rq *rq)
  {
+       int fifo_nr_running;
+
+       /* Deadline tasks, even if single, need the tick */
+       if (rq->dl.dl_nr_running)
+               return false;
+
         /*
-        * FIFO realtime policy runs the highest priority task. Other runnable
-        * tasks are of a lower priority. The scheduler tick does nothing.
+        * FIFO realtime policy runs the highest priority task (after DEADLINE).
+        * Other runnable tasks are of a lower priority. The scheduler tick
+        * isn't needed.
          */
-       if (current->policy == SCHED_FIFO)
+       fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
+       if (fifo_nr_running)
                 return true;
  
         /*
          * Round-robin realtime tasks time slice with other tasks at the same
-        * realtime priority. Is this task the only one at this priority?
+        * realtime priority.
          */
-       if (current->policy == SCHED_RR) {
-               struct sched_rt_entity *rt_se = &current->rt;
-
-               return list_is_singular(&rt_se->run_list);
+       if (rq->rt.rr_nr_running) {
+               if (rq->rt.rr_nr_running == 1)
+                       return true;
+               else
+                       return false;
         }
  
-       /*
-        * More than one running task need preemption.
-        * nr_running update is assumed to be visible
-        * after IPI is sent from wakers.
-        */
-       if (this_rq()->nr_running > 1)
+       /* Normal multitasking need periodic preemption checks */
+       if (rq->cfs.nr_running > 1)
                 return false;
  
         return true;
@@ -2093,7 +1951,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
  
         ttwu_queue(p, cpu);
  stat:
-       ttwu_stat(p, cpu, wake_flags);
+       if (schedstat_enabled())
+               ttwu_stat(p, cpu, wake_flags);
  out:
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
  
@@ -2141,7 +2000,8 @@ static void try_to_wake_up_local(struct task_struct *p)
                 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
  
         ttwu_do_wakeup(rq, p, 0);
-       ttwu_stat(p, smp_processor_id(), 0);
+       if (schedstat_enabled())
+               ttwu_stat(p, smp_processor_id(), 0);
  out:
         raw_spin_unlock(&p->pi_lock);
  }
@@ -2183,7 +2043,6 @@ void __dl_clear_params(struct task_struct *p)
         dl_se->dl_bw = 0;
  
         dl_se->dl_throttled = 0;
-       dl_se->dl_new = 1;
         dl_se->dl_yielded = 0;
  }
  
@@ -2210,6 +2069,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
  #endif
  
  #ifdef CONFIG_SCHEDSTATS
+       /* Even if schedstat is disabled, there should not be garbage */
         memset(&p->se.statistics, 0, sizeof(p->se.statistics));
  #endif
  
@@ -2218,6 +2078,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
         __dl_clear_params(p);
  
         INIT_LIST_HEAD(&p->rt.run_list);
+       p->rt.timeout           = 0;
+       p->rt.time_slice        = sched_rr_timeslice;
+       p->rt.on_rq             = 0;
+       p->rt.on_list           = 0;
  
  #ifdef CONFIG_PREEMPT_NOTIFIERS
         INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2281,6 +2145,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
  #endif
  #endif
  
+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+
+#ifdef CONFIG_SCHEDSTATS
+static void set_schedstats(bool enabled)
+{
+       if (enabled)
+               static_branch_enable(&sched_schedstats);
+       else
+               static_branch_disable(&sched_schedstats);
+}
+
+void force_schedstat_enabled(void)
+{
+       if (!schedstat_enabled()) {
+               pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
+               static_branch_enable(&sched_schedstats);
+       }
+}
+
+static int __init setup_schedstats(char *str)
+{
+       int ret = 0;
+       if (!str)
+               goto out;
+
+       if (!strcmp(str, "enable")) {
+               set_schedstats(true);
+               ret = 1;
+       } else if (!strcmp(str, "disable")) {
+               set_schedstats(false);
+               ret = 1;
+       }
+out:
+       if (!ret)
+               pr_warn("Unable to parse schedstats=\n");
+
+       return ret;
+}
+__setup("schedstats=", setup_schedstats);
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_schedstats(struct ctl_table *table, int write,
+                        void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       struct ctl_table t;
+       int err;
+       int state = static_branch_likely(&sched_schedstats);
+
+       if (write && !capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       t = *table;
+       t.data = &state;
+       err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+       if (err < 0)
+               return err;
+       if (write)
+               set_schedstats(state);
+       return err;
+}
+#endif
+#endif
+
  /*
   * fork()/clone()-time setup:
   */
@@ -3010,16 +2937,6 @@ u64 scheduler_tick_max_deferment(void)
  }
  #endif
  
-notrace unsigned long get_parent_ip(unsigned long addr)
-{
-       if (in_lock_functions(addr)) {
-               addr = CALLER_ADDR2;
-               if (in_lock_functions(addr))
-                       addr = CALLER_ADDR3;
-       }
-       return addr;
-}
-
  #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                 defined(CONFIG_PREEMPT_TRACER))
  
@@ -3041,7 +2958,7 @@ void preempt_count_add(int val)
                                 PREEMPT_MASK - 10);
  #endif
         if (preempt_count() == val) {
-               unsigned long ip = get_parent_ip(CALLER_ADDR1);
+               unsigned long ip = get_lock_parent_ip();
  #ifdef CONFIG_DEBUG_PREEMPT
                 current->preempt_disable_ip = ip;
  #endif
@@ -3068,7 +2985,7 @@ void preempt_count_sub(int val)
  #endif
  
         if (preempt_count() == val)
-               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+               trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
         __preempt_count_sub(val);
  }
  EXPORT_SYMBOL(preempt_count_sub);
@@ -3280,7 +3197,6 @@ static void __sched notrace __schedule(bool preempt)
  
                 trace_sched_switch(preempt, prev, next);
                 rq = context_switch(rq, prev, next); /* unlocks the rq */
-               cpu = cpu_of(rq);
         } else {
                 lockdep_unpin_lock(&rq->lock);
                 raw_spin_unlock_irq(&rq->lock);
@@ -3466,7 +3382,7 @@ EXPORT_SYMBOL(default_wake_function);
   */
  void rt_mutex_setprio(struct task_struct *p, int prio)
  {
-       int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
+       int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
         struct rq *rq;
         const struct sched_class *prev_class;
  
@@ -3494,11 +3410,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
  
         trace_sched_pi_setprio(p, prio);
         oldprio = p->prio;
+
+       if (oldprio == prio)
+               queue_flag &= ~DEQUEUE_MOVE;
+
         prev_class = p->sched_class;
         queued = task_on_rq_queued(p);
         running = task_current(rq, p);
         if (queued)
-               dequeue_task(rq, p, DEQUEUE_SAVE);
+               dequeue_task(rq, p, queue_flag);
         if (running)
                 put_prev_task(rq, p);
  
@@ -3516,7 +3436,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                 if (!dl_prio(p->normal_prio) ||
                     (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                         p->dl.dl_boosted = 1;
-                       enqueue_flag |= ENQUEUE_REPLENISH;
+                       queue_flag |= ENQUEUE_REPLENISH;
                 } else
                         p->dl.dl_boosted = 0;
                 p->sched_class = &dl_sched_class;
@@ -3524,7 +3444,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                 if (dl_prio(oldprio))
                         p->dl.dl_boosted = 0;
                 if (oldprio < prio)
-                       enqueue_flag |= ENQUEUE_HEAD;
+                       queue_flag |= ENQUEUE_HEAD;
                 p->sched_class = &rt_sched_class;
         } else {
                 if (dl_prio(oldprio))
@@ -3539,7 +3459,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (queued)
-               enqueue_task(rq, p, enqueue_flag);
+               enqueue_task(rq, p, queue_flag);
  
         check_class_changed(rq, p, prev_class, oldprio);
  out_unlock:
@@ -3895,6 +3815,7 @@ static int __sched_setscheduler(struct task_struct *p,
         const struct sched_class *prev_class;
         struct rq *rq;
         int reset_on_fork;
+       int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
  
         /* may grab non-irq protected spin_locks */
         BUG_ON(in_interrupt());
@@ -4077,17 +3998,14 @@ change:
                  * itself.
                  */
                 new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
-               if (new_effective_prio == oldprio) {
-                       __setscheduler_params(p, attr);
-                       task_rq_unlock(rq, p, &flags);
-                       return 0;
-               }
+               if (new_effective_prio == oldprio)
+                       queue_flags &= ~DEQUEUE_MOVE;
         }
  
         queued = task_on_rq_queued(p);
         running = task_current(rq, p);
         if (queued)
-               dequeue_task(rq, p, DEQUEUE_SAVE);
+               dequeue_task(rq, p, queue_flags);
         if (running)
                 put_prev_task(rq, p);
  
@@ -4097,15 +4015,14 @@ change:
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (queued) {
-               int enqueue_flags = ENQUEUE_RESTORE;
                 /*
                  * We enqueue to tail when the priority of a task is
                  * increased (user space view).
                  */
-               if (oldprio <= p->prio)
-                       enqueue_flags |= ENQUEUE_HEAD;
+               if (oldprio < p->prio)
+                       queue_flags |= ENQUEUE_HEAD;
  
-               enqueue_task(rq, p, enqueue_flags);
+               enqueue_task(rq, p, queue_flags);
         }
  
         check_class_changed(rq, p, prev_class, oldprio);
@@ -5096,6 +5013,8 @@ void init_idle(struct task_struct *idle, int cpu)
         idle->state = TASK_RUNNING;
         idle->se.exec_start = sched_clock();
  
+       kasan_unpoison_task_stack(idle);
+
  #ifdef CONFIG_SMP
         /*
          * Its possible that init_idle() gets called multiple times on a task,
@@ -5405,183 +5324,6 @@ static void migrate_tasks(struct rq *dead_rq)
  }
  #endif /* CONFIG_HOTPLUG_CPU */
  
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
-
-static struct ctl_table sd_ctl_dir[] = {
-       {
-               .procname       = "sched_domain",
-               .mode           = 0555,
-       },
-       {}
-};
-
-static struct ctl_table sd_ctl_root[] = {
-       {
-               .procname       = "kernel",
-               .mode           = 0555,
-               .child          = sd_ctl_dir,
-       },
-       {}
-};
-
-static struct ctl_table *sd_alloc_ctl_entry(int n)
-{
-       struct ctl_table *entry =
-               kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
-
-       return entry;
-}
-
-static void sd_free_ctl_entry(struct ctl_table **tablep)
-{
-       struct ctl_table *entry;
-
-       /*
-        * In the intermediate directories, both the child directory and
-        * procname are dynamically allocated and could fail but the mode
-        * will always be set. In the lowest directory the names are
-        * static strings and all have proc handlers.
-        */
-       for (entry = *tablep; entry->mode; entry++) {
-               if (entry->child)
-                       sd_free_ctl_entry(&entry->child);
-               if (entry->proc_handler == NULL)
-                       kfree(entry->procname);
-       }
-
-       kfree(*tablep);
-       *tablep = NULL;
-}
-
-static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX-1;
-
-static void
-set_table_entry(struct ctl_table *entry,
-               const char *procname, void *data, int maxlen,
-               umode_t mode, proc_handler *proc_handler,
-               bool load_idx)
-{
-       entry->procname = procname;
-       entry->data = data;
-       entry->maxlen = maxlen;
-       entry->mode = mode;
-       entry->proc_handler = proc_handler;
-
-       if (load_idx) {
-               entry->extra1 = &min_load_idx;
-               entry->extra2 = &max_load_idx;
-       }
-}
-
-static struct ctl_table *
-sd_alloc_ctl_domain_table(struct sched_domain *sd)
-{
-       struct ctl_table *table = sd_alloc_ctl_entry(14);
-
-       if (table == NULL)
-               return NULL;
-
-       set_table_entry(&table[0], "min_interval", &sd->min_interval,
-               sizeof(long), 0644, proc_doulongvec_minmax, false);
-       set_table_entry(&table[1], "max_interval", &sd->max_interval,
-               sizeof(long), 0644, proc_doulongvec_minmax, false);
-       set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
-               sizeof(int), 0644, proc_dointvec_minmax, true);
-       set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
-               sizeof(int), 0644, proc_dointvec_minmax, true);
-       set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
-               sizeof(int), 0644, proc_dointvec_minmax, true);
-       set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
-               sizeof(int), 0644, proc_dointvec_minmax, true);
-       set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
-               sizeof(int), 0644, proc_dointvec_minmax, true);
-       set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
-               sizeof(int), 0644, proc_dointvec_minmax, false);
-       set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
-               sizeof(int), 0644, proc_dointvec_minmax, false);
-       set_table_entry(&table[9], "cache_nice_tries",
-               &sd->cache_nice_tries,
-               sizeof(int), 0644, proc_dointvec_minmax, false);
-       set_table_entry(&table[10], "flags", &sd->flags,
-               sizeof(int), 0644, proc_dointvec_minmax, false);
-       set_table_entry(&table[11], "max_newidle_lb_cost",
-               &sd->max_newidle_lb_cost,
-               sizeof(long), 0644, proc_doulongvec_minmax, false);
-       set_table_entry(&table[12], "name", sd->name,
-               CORENAME_MAX_SIZE, 0444, proc_dostring, false);
-       /* &table[13] is terminator */
-
-       return table;
-}
-
-static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
-{
-       struct ctl_table *entry, *table;
-       struct sched_domain *sd;
-       int domain_num = 0, i;
-       char buf[32];
-
-       for_each_domain(cpu, sd)
-               domain_num++;
-       entry = table = sd_alloc_ctl_entry(domain_num + 1);
-       if (table == NULL)
-               return NULL;
-
-       i = 0;
-       for_each_domain(cpu, sd) {
-               snprintf(buf, 32, "domain%d", i);
-               entry->procname = kstrdup(buf, GFP_KERNEL);
-               entry->mode = 0555;
-               entry->child = sd_alloc_ctl_domain_table(sd);
-               entry++;
-               i++;
-       }
-       return table;
-}
-
-static struct ctl_table_header *sd_sysctl_header;
-static void register_sched_domain_sysctl(void)
-{
-       int i, cpu_num = num_possible_cpus();
-       struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
-       char buf[32];
-
-       WARN_ON(sd_ctl_dir[0].child);
-       sd_ctl_dir[0].child = entry;
-
-       if (entry == NULL)
-               return;
-
-       for_each_possible_cpu(i) {
-               snprintf(buf, 32, "cpu%d", i);
-               entry->procname = kstrdup(buf, GFP_KERNEL);
-               entry->mode = 0555;
-               entry->child = sd_alloc_ctl_cpu_table(i);
-               entry++;
-       }
-
-       WARN_ON(sd_sysctl_header);
-       sd_sysctl_header = register_sysctl_table(sd_ctl_root);
-}
-
-/* may be called multiple times per register */
-static void unregister_sched_domain_sysctl(void)
-{
-       unregister_sysctl_table(sd_sysctl_header);
-       sd_sysctl_header = NULL;
-       if (sd_ctl_dir[0].child)
-               sd_free_ctl_entry(&sd_ctl_dir[0].child);
-}
-#else
-static void register_sched_domain_sysctl(void)
-{
-}
-static void unregister_sched_domain_sysctl(void)
-{
-}
-#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
-
  static void set_rq_online(struct rq *rq)
  {
         if (!rq->online) {
@@ -6173,11 +5915,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
  /* Setup the mask of cpus configured for isolated domains */
  static int __init isolated_cpu_setup(char *str)
  {
+       int ret;
+
         alloc_bootmem_cpumask_var(&cpu_isolated_map);
-       cpulist_parse(str, cpu_isolated_map);
+       ret = cpulist_parse(str, cpu_isolated_map);
+       if (ret) {
+               pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
+               return 0;
+       }
         return 1;
  }
-
  __setup("isolcpus=", isolated_cpu_setup);
  
  struct s_data {
@@ -7860,11 +7607,9 @@ void sched_destroy_group(struct task_group *tg)
  void sched_offline_group(struct task_group *tg)
  {
         unsigned long flags;
-       int i;
  
         /* end participation in shares distribution */
-       for_each_possible_cpu(i)
-               unregister_fair_sched_group(tg, i);
+       unregister_fair_sched_group(tg);
  
         spin_lock_irqsave(&task_group_lock, flags);
         list_del_rcu(&tg->list);
@@ -7890,7 +7635,7 @@ void sched_move_task(struct task_struct *tsk)
         queued = task_on_rq_queued(tsk);
  
         if (queued)
-               dequeue_task(rq, tsk, DEQUEUE_SAVE);
+               dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
         if (unlikely(running))
                 put_prev_task(rq, tsk);
  
@@ -7914,7 +7659,7 @@ void sched_move_task(struct task_struct *tsk)
         if (unlikely(running))
                 tsk->sched_class->set_curr_task(rq);
         if (queued)
-               enqueue_task(rq, tsk, ENQUEUE_RESTORE);
+               enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
  
         task_rq_unlock(rq, tsk, &flags);
  }
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c

index b2ab2ffb1adc021de289d3e3c857de60ad2428b6..75f98c5498d55d38b0a36bba2719907742166fe1 100644 (file)
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -262,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void)
  #ifdef CONFIG_PARAVIRT
         if (static_key_false(&paravirt_steal_enabled)) {
                 u64 steal;
-               cputime_t steal_ct;
+               unsigned long steal_jiffies;
  
                 steal = paravirt_steal_clock(smp_processor_id());
                 steal -= this_rq()->prev_steal_time;
  
                 /*
-                * cputime_t may be less precise than nsecs (eg: if it's
-                * based on jiffies). Lets cast the result to cputime
+                * steal is in nsecs but our caller is expecting steal
+                * time in jiffies. Lets cast the result to jiffies
                  * granularity and account the rest on the next rounds.
                  */
-               steal_ct = nsecs_to_cputime(steal);
-               this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
+               steal_jiffies = nsecs_to_jiffies(steal);
+               this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
  
-               account_steal_time(steal_ct);
-               return steal_ct;
+               account_steal_time(jiffies_to_cputime(steal_jiffies));
+               return steal_jiffies;
         }
  #endif
         return false;
@@ -668,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
  #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
  
  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static unsigned long long vtime_delta(struct task_struct *tsk)
+static cputime_t vtime_delta(struct task_struct *tsk)
  {
-       unsigned long long clock;
+       unsigned long now = READ_ONCE(jiffies);
  
-       clock = local_clock();
-       if (clock < tsk->vtime_snap)
+       if (time_before(now, (unsigned long)tsk->vtime_snap))
                 return 0;
  
-       return clock - tsk->vtime_snap;
+       return jiffies_to_cputime(now - tsk->vtime_snap);
  }
  
  static cputime_t get_vtime_delta(struct task_struct *tsk)
  {
-       unsigned long long delta = vtime_delta(tsk);
+       unsigned long now = READ_ONCE(jiffies);
+       unsigned long delta = now - tsk->vtime_snap;
  
         WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
-       tsk->vtime_snap += delta;
+       tsk->vtime_snap = now;
  
-       /* CHECKME: always safe to convert nsecs to cputime? */
-       return nsecs_to_cputime(delta);
+       return jiffies_to_cputime(delta);
  }
  
  static void __vtime_account_system(struct task_struct *tsk)
@@ -699,6 +698,9 @@ static void __vtime_account_system(struct task_struct *tsk)
  
  void vtime_account_system(struct task_struct *tsk)
  {
+       if (!vtime_delta(tsk))
+               return;
+
         write_seqcount_begin(&tsk->vtime_seqcount);
         __vtime_account_system(tsk);
         write_seqcount_end(&tsk->vtime_seqcount);
@@ -707,7 +709,8 @@ void vtime_account_system(struct task_struct *tsk)
  void vtime_gen_account_irq_exit(struct task_struct *tsk)
  {
         write_seqcount_begin(&tsk->vtime_seqcount);
-       __vtime_account_system(tsk);
+       if (vtime_delta(tsk))
+               __vtime_account_system(tsk);
         if (context_tracking_in_user())
                 tsk->vtime_snap_whence = VTIME_USER;
         write_seqcount_end(&tsk->vtime_seqcount);
@@ -718,16 +721,19 @@ void vtime_account_user(struct task_struct *tsk)
         cputime_t delta_cpu;
  
         write_seqcount_begin(&tsk->vtime_seqcount);
-       delta_cpu = get_vtime_delta(tsk);
         tsk->vtime_snap_whence = VTIME_SYS;
-       account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+       if (vtime_delta(tsk)) {
+               delta_cpu = get_vtime_delta(tsk);
+               account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+       }
         write_seqcount_end(&tsk->vtime_seqcount);
  }
  
  void vtime_user_enter(struct task_struct *tsk)
  {
         write_seqcount_begin(&tsk->vtime_seqcount);
-       __vtime_account_system(tsk);
+       if (vtime_delta(tsk))
+               __vtime_account_system(tsk);
         tsk->vtime_snap_whence = VTIME_USER;
         write_seqcount_end(&tsk->vtime_seqcount);
  }
@@ -742,7 +748,8 @@ void vtime_guest_enter(struct task_struct *tsk)
          * that can thus safely catch up with a tickless delta.
          */
         write_seqcount_begin(&tsk->vtime_seqcount);
-       __vtime_account_system(tsk);
+       if (vtime_delta(tsk))
+               __vtime_account_system(tsk);
         current->flags |= PF_VCPU;
         write_seqcount_end(&tsk->vtime_seqcount);
  }
@@ -772,7 +779,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
  
         write_seqcount_begin(&current->vtime_seqcount);
         current->vtime_snap_whence = VTIME_SYS;
-       current->vtime_snap = sched_clock_cpu(smp_processor_id());
+       current->vtime_snap = jiffies;
         write_seqcount_end(&current->vtime_seqcount);
  }
  
@@ -783,7 +790,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
         local_irq_save(flags);
         write_seqcount_begin(&t->vtime_seqcount);
         t->vtime_snap_whence = VTIME_SYS;
-       t->vtime_snap = sched_clock_cpu(cpu);
+       t->vtime_snap = jiffies;
         write_seqcount_end(&t->vtime_seqcount);
         local_irq_restore(flags);
  }
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c

index 57b939c81bce5d06fa587df8915f05affbe22b82..c7a036facbe1d311e24a56d470288ecf8d43d0d3 100644 (file)
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -352,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
         struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
         struct rq *rq = rq_of_dl_rq(dl_rq);
  
-       WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
+       WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
+
+       /*
+        * We are racing with the deadline timer. So, do nothing because
+        * the deadline timer handler will take care of properly recharging
+        * the runtime and postponing the deadline
+        */
+       if (dl_se->dl_throttled)
+               return;
  
         /*
          * We use the regular wall clock time to set deadlines in the
@@ -361,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
          */
         dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
         dl_se->runtime = pi_se->dl_runtime;
-       dl_se->dl_new = 0;
  }
  
  /*
@@ -399,6 +406,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
                 dl_se->runtime = pi_se->dl_runtime;
         }
  
+       if (dl_se->dl_yielded && dl_se->runtime > 0)
+               dl_se->runtime = 0;
+
         /*
          * We keep moving the deadline away until we get some
          * available runtime for the entity. This ensures correct
@@ -500,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
         struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
         struct rq *rq = rq_of_dl_rq(dl_rq);
  
-       /*
-        * The arrival of a new instance needs special treatment, i.e.,
-        * the actual scheduling parameters have to be "renewed".
-        */
-       if (dl_se->dl_new) {
-               setup_new_dl_entity(dl_se, pi_se);
-               return;
-       }
-
         if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
             dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
                 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
@@ -604,16 +605,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
                 goto unlock;
         }
  
-       /*
-        * This is possible if switched_from_dl() raced against a running
-        * callback that took the above !dl_task() path and we've since then
-        * switched back into SCHED_DEADLINE.
-        *
-        * There's nothing to do except drop our task reference.
-        */
-       if (dl_se->dl_new)
-               goto unlock;
-
         /*
          * The task might have been boosted by someone else and might be in the
          * boosting/deboosting path, its not throttled.
@@ -735,8 +726,11 @@ static void update_curr_dl(struct rq *rq)
          * approach need further study.
          */
         delta_exec = rq_clock_task(rq) - curr->se.exec_start;
-       if (unlikely((s64)delta_exec <= 0))
+       if (unlikely((s64)delta_exec <= 0)) {
+               if (unlikely(dl_se->dl_yielded))
+                       goto throttle;
                 return;
+       }
  
         schedstat_set(curr->se.statistics.exec_max,
                       max(curr->se.statistics.exec_max, delta_exec));
@@ -749,8 +743,10 @@ static void update_curr_dl(struct rq *rq)
  
         sched_rt_avg_update(rq, delta_exec);
  
-       dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
-       if (dl_runtime_exceeded(dl_se)) {
+       dl_se->runtime -= delta_exec;
+
+throttle:
+       if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
                 dl_se->dl_throttled = 1;
                 __dequeue_task_dl(rq, curr, 0);
                 if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
@@ -917,7 +913,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
          * parameters of the task might need updating. Otherwise,
          * we want a replenishment of its runtime.
          */
-       if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
+       if (flags & ENQUEUE_WAKEUP)
                 update_dl_entity(dl_se, pi_se);
         else if (flags & ENQUEUE_REPLENISH)
                 replenish_dl_entity(dl_se, pi_se);
@@ -994,18 +990,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
   */
  static void yield_task_dl(struct rq *rq)
  {
-       struct task_struct *p = rq->curr;
-
         /*
          * We make the task go to sleep until its current deadline by
          * forcing its runtime to zero. This way, update_curr_dl() stops
          * it and the bandwidth timer will wake it up and will give it
          * new scheduling parameters (thanks to dl_yielded=1).
          */
-       if (p->dl.runtime > 0) {
-               rq->curr->dl.dl_yielded = 1;
-               p->dl.runtime = 0;
-       }
+       rq->curr->dl.dl_yielded = 1;
+
         update_rq_clock(rq);
         update_curr_dl(rq);
         /*
@@ -1722,6 +1714,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
   */
  static void switched_to_dl(struct rq *rq, struct task_struct *p)
  {
+       if (dl_time_before(p->dl.deadline, rq_clock(rq)))
+               setup_new_dl_entity(&p->dl, &p->dl);
+
         if (task_on_rq_queued(p) && rq->curr != p) {
  #ifdef CONFIG_SMP
                 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
@@ -1768,8 +1763,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
                  */
                 resched_curr(rq);
  #endif /* CONFIG_SMP */
-       } else
-               switched_to_dl(rq, p);
+       }
  }
  
  const struct sched_class dl_sched_class = {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c

index 641511771ae6a696271f77532ac9e40e28175749..4fbc3bd5ff6067dfe184295fc262987c912b669e 100644 (file)
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -16,6 +16,7 @@
  #include <linux/kallsyms.h>
  #include <linux/utsname.h>
  #include <linux/mempolicy.h>
+#include <linux/debugfs.h>
  
  #include "sched.h"
  
@@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec)
  
  #define SPLIT_NS(x) nsec_high(x), nsec_low(x)
  
+#define SCHED_FEAT(name, enabled)      \
+       #name ,
+
+static const char * const sched_feat_names[] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static int sched_feat_show(struct seq_file *m, void *v)
+{
+       int i;
+
+       for (i = 0; i < __SCHED_FEAT_NR; i++) {
+               if (!(sysctl_sched_features & (1UL << i)))
+                       seq_puts(m, "NO_");
+               seq_printf(m, "%s ", sched_feat_names[i]);
+       }
+       seq_puts(m, "\n");
+
+       return 0;
+}
+
+#ifdef HAVE_JUMP_LABEL
+
+#define jump_label_key__true  STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
+
+#define SCHED_FEAT(name, enabled)      \
+       jump_label_key__##enabled ,
+
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static void sched_feat_disable(int i)
+{
+       static_key_disable(&sched_feat_keys[i]);
+}
+
+static void sched_feat_enable(int i)
+{
+       static_key_enable(&sched_feat_keys[i]);
+}
+#else
+static void sched_feat_disable(int i) { };
+static void sched_feat_enable(int i) { };
+#endif /* HAVE_JUMP_LABEL */
+
+static int sched_feat_set(char *cmp)
+{
+       int i;
+       int neg = 0;
+
+       if (strncmp(cmp, "NO_", 3) == 0) {
+               neg = 1;
+               cmp += 3;
+       }
+
+       for (i = 0; i < __SCHED_FEAT_NR; i++) {
+               if (strcmp(cmp, sched_feat_names[i]) == 0) {
+                       if (neg) {
+                               sysctl_sched_features &= ~(1UL << i);
+                               sched_feat_disable(i);
+                       } else {
+                               sysctl_sched_features |= (1UL << i);
+                               sched_feat_enable(i);
+                       }
+                       break;
+               }
+       }
+
+       return i;
+}
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+               size_t cnt, loff_t *ppos)
+{
+       char buf[64];
+       char *cmp;
+       int i;
+       struct inode *inode;
+
+       if (cnt > 63)
+               cnt = 63;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+       cmp = strstrip(buf);
+
+       /* Ensure the static_key remains in a consistent state */
+       inode = file_inode(filp);
+       inode_lock(inode);
+       i = sched_feat_set(cmp);
+       inode_unlock(inode);
+       if (i == __SCHED_FEAT_NR)
+               return -EINVAL;
+
+       *ppos += cnt;
+
+       return cnt;
+}
+
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+       return single_open(filp, sched_feat_show, NULL);
+}
+
+static const struct file_operations sched_feat_fops = {
+       .open           = sched_feat_open,
+       .write          = sched_feat_write,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static __init int sched_init_debug(void)
+{
+       debugfs_create_file("sched_features", 0644, NULL, NULL,
+                       &sched_feat_fops);
+
+       return 0;
+}
+late_initcall(sched_init_debug);
+
+#ifdef CONFIG_SMP
+
+#ifdef CONFIG_SYSCTL
+
+static struct ctl_table sd_ctl_dir[] = {
+       {
+               .procname       = "sched_domain",
+               .mode           = 0555,
+       },
+       {}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+       {
+               .procname       = "kernel",
+               .mode           = 0555,
+               .child          = sd_ctl_dir,
+       },
+       {}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+       struct ctl_table *entry =
+               kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+       return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+       struct ctl_table *entry;
+
+       /*
+        * In the intermediate directories, both the child directory and
+        * procname are dynamically allocated and could fail but the mode
+        * will always be set. In the lowest directory the names are
+        * static strings and all have proc handlers.
+        */
+       for (entry = *tablep; entry->mode; entry++) {
+               if (entry->child)
+                       sd_free_ctl_entry(&entry->child);
+               if (entry->proc_handler == NULL)
+                       kfree(entry->procname);
+       }
+
+       kfree(*tablep);
+       *tablep = NULL;
+}
+
+static int min_load_idx = 0;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
+
+static void
+set_table_entry(struct ctl_table *entry,
+               const char *procname, void *data, int maxlen,
+               umode_t mode, proc_handler *proc_handler,
+               bool load_idx)
+{
+       entry->procname = procname;
+       entry->data = data;
+       entry->maxlen = maxlen;
+       entry->mode = mode;
+       entry->proc_handler = proc_handler;
+
+       if (load_idx) {
+               entry->extra1 = &min_load_idx;
+               entry->extra2 = &max_load_idx;
+       }
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+       struct ctl_table *table = sd_alloc_ctl_entry(14);
+
+       if (table == NULL)
+               return NULL;
+
+       set_table_entry(&table[0], "min_interval", &sd->min_interval,
+               sizeof(long), 0644, proc_doulongvec_minmax, false);
+       set_table_entry(&table[1], "max_interval", &sd->max_interval,
+               sizeof(long), 0644, proc_doulongvec_minmax, false);
+       set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+               sizeof(int), 0644, proc_dointvec_minmax, true);
+       set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+               sizeof(int), 0644, proc_dointvec_minmax, true);
+       set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+               sizeof(int), 0644, proc_dointvec_minmax, true);
+       set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+               sizeof(int), 0644, proc_dointvec_minmax, true);
+       set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+               sizeof(int), 0644, proc_dointvec_minmax, true);
+       set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+               sizeof(int), 0644, proc_dointvec_minmax, false);
+       set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+               sizeof(int), 0644, proc_dointvec_minmax, false);
+       set_table_entry(&table[9], "cache_nice_tries",
+               &sd->cache_nice_tries,
+               sizeof(int), 0644, proc_dointvec_minmax, false);
+       set_table_entry(&table[10], "flags", &sd->flags,
+               sizeof(int), 0644, proc_dointvec_minmax, false);
+       set_table_entry(&table[11], "max_newidle_lb_cost",
+               &sd->max_newidle_lb_cost,
+               sizeof(long), 0644, proc_doulongvec_minmax, false);
+       set_table_entry(&table[12], "name", sd->name,
+               CORENAME_MAX_SIZE, 0444, proc_dostring, false);
+       /* &table[13] is terminator */
+
+       return table;
+}
+
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+       struct ctl_table *entry, *table;
+       struct sched_domain *sd;
+       int domain_num = 0, i;
+       char buf[32];
+
+       for_each_domain(cpu, sd)
+               domain_num++;
+       entry = table = sd_alloc_ctl_entry(domain_num + 1);
+       if (table == NULL)
+               return NULL;
+
+       i = 0;
+       for_each_domain(cpu, sd) {
+               snprintf(buf, 32, "domain%d", i);
+               entry->procname = kstrdup(buf, GFP_KERNEL);
+               entry->mode = 0555;
+               entry->child = sd_alloc_ctl_domain_table(sd);
+               entry++;
+               i++;
+       }
+       return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+void register_sched_domain_sysctl(void)
+{
+       int i, cpu_num = num_possible_cpus();
+       struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+       char buf[32];
+
+       WARN_ON(sd_ctl_dir[0].child);
+       sd_ctl_dir[0].child = entry;
+
+       if (entry == NULL)
+               return;
+
+       for_each_possible_cpu(i) {
+               snprintf(buf, 32, "cpu%d", i);
+               entry->procname = kstrdup(buf, GFP_KERNEL);
+               entry->mode = 0555;
+               entry->child = sd_alloc_ctl_cpu_table(i);
+               entry++;
+       }
+
+       WARN_ON(sd_sysctl_header);
+       sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+/* may be called multiple times per register */
+void unregister_sched_domain_sysctl(void)
+{
+       unregister_sysctl_table(sd_sysctl_header);
+       sd_sysctl_header = NULL;
+       if (sd_ctl_dir[0].child)
+               sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
+#endif /* CONFIG_SYSCTL */
+#endif /* CONFIG_SMP */
+
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
  {
@@ -75,16 +379,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
         PN(se->vruntime);
         PN(se->sum_exec_runtime);
  #ifdef CONFIG_SCHEDSTATS
-       PN(se->statistics.wait_start);
-       PN(se->statistics.sleep_start);
-       PN(se->statistics.block_start);
-       PN(se->statistics.sleep_max);
-       PN(se->statistics.block_max);
-       PN(se->statistics.exec_max);
-       PN(se->statistics.slice_max);
-       PN(se->statistics.wait_max);
-       PN(se->statistics.wait_sum);
-       P(se->statistics.wait_count);
+       if (schedstat_enabled()) {
+               PN(se->statistics.wait_start);
+               PN(se->statistics.sleep_start);
+               PN(se->statistics.block_start);
+               PN(se->statistics.sleep_max);
+               PN(se->statistics.block_max);
+               PN(se->statistics.exec_max);
+               PN(se->statistics.slice_max);
+               PN(se->statistics.wait_max);
+               PN(se->statistics.wait_sum);
+               P(se->statistics.wait_count);
+       }
  #endif
         P(se->load.weight);
  #ifdef CONFIG_SMP
@@ -122,10 +428,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
                 (long long)(p->nvcsw + p->nivcsw),
                 p->prio);
  #ifdef CONFIG_SCHEDSTATS
-       SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-               SPLIT_NS(p->se.statistics.wait_sum),
-               SPLIT_NS(p->se.sum_exec_runtime),
-               SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+       if (schedstat_enabled()) {
+               SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+                       SPLIT_NS(p->se.statistics.wait_sum),
+                       SPLIT_NS(p->se.sum_exec_runtime),
+                       SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+       }
  #else
         SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
                 0LL, 0L,
@@ -258,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
  
  void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
  {
+       struct dl_bw *dl_bw;
+
         SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
         SEQ_printf(m, "  .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
+#ifdef CONFIG_SMP
+       dl_bw = &cpu_rq(cpu)->rd->dl_bw;
+#else
+       dl_bw = &dl_rq->dl_bw;
+#endif
+       SEQ_printf(m, "  .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
+       SEQ_printf(m, "  .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
  }
  
  extern __read_mostly int sched_clock_running;
@@ -313,17 +630,18 @@ do {                                                                      \
  #define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
  #define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
  
-       P(yld_count);
-
-       P(sched_count);
-       P(sched_goidle);
  #ifdef CONFIG_SMP
         P64(avg_idle);
         P64(max_idle_balance_cost);
  #endif
  
-       P(ttwu_count);
-       P(ttwu_local);
+       if (schedstat_enabled()) {
+               P(yld_count);
+               P(sched_count);
+               P(sched_goidle);
+               P(ttwu_count);
+               P(ttwu_local);
+       }
  
  #undef P
  #undef P64
@@ -569,38 +887,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
         nr_switches = p->nvcsw + p->nivcsw;
  
  #ifdef CONFIG_SCHEDSTATS
-       PN(se.statistics.sum_sleep_runtime);
-       PN(se.statistics.wait_start);
-       PN(se.statistics.sleep_start);
-       PN(se.statistics.block_start);
-       PN(se.statistics.sleep_max);
-       PN(se.statistics.block_max);
-       PN(se.statistics.exec_max);
-       PN(se.statistics.slice_max);
-       PN(se.statistics.wait_max);
-       PN(se.statistics.wait_sum);
-       P(se.statistics.wait_count);
-       PN(se.statistics.iowait_sum);
-       P(se.statistics.iowait_count);
         P(se.nr_migrations);
-       P(se.statistics.nr_migrations_cold);
-       P(se.statistics.nr_failed_migrations_affine);
-       P(se.statistics.nr_failed_migrations_running);
-       P(se.statistics.nr_failed_migrations_hot);
-       P(se.statistics.nr_forced_migrations);
-       P(se.statistics.nr_wakeups);
-       P(se.statistics.nr_wakeups_sync);
-       P(se.statistics.nr_wakeups_migrate);
-       P(se.statistics.nr_wakeups_local);
-       P(se.statistics.nr_wakeups_remote);
-       P(se.statistics.nr_wakeups_affine);
-       P(se.statistics.nr_wakeups_affine_attempts);
-       P(se.statistics.nr_wakeups_passive);
-       P(se.statistics.nr_wakeups_idle);
  
-       {
+       if (schedstat_enabled()) {
                 u64 avg_atom, avg_per_cpu;
  
+               PN(se.statistics.sum_sleep_runtime);
+               PN(se.statistics.wait_start);
+               PN(se.statistics.sleep_start);
+               PN(se.statistics.block_start);
+               PN(se.statistics.sleep_max);
+               PN(se.statistics.block_max);
+               PN(se.statistics.exec_max);
+               PN(se.statistics.slice_max);
+               PN(se.statistics.wait_max);
+               PN(se.statistics.wait_sum);
+               P(se.statistics.wait_count);
+               PN(se.statistics.iowait_sum);
+               P(se.statistics.iowait_count);
+               P(se.statistics.nr_migrations_cold);
+               P(se.statistics.nr_failed_migrations_affine);
+               P(se.statistics.nr_failed_migrations_running);
+               P(se.statistics.nr_failed_migrations_hot);
+               P(se.statistics.nr_forced_migrations);
+               P(se.statistics.nr_wakeups);
+               P(se.statistics.nr_wakeups_sync);
+               P(se.statistics.nr_wakeups_migrate);
+               P(se.statistics.nr_wakeups_local);
+               P(se.statistics.nr_wakeups_remote);
+               P(se.statistics.nr_wakeups_affine);
+               P(se.statistics.nr_wakeups_affine_attempts);
+               P(se.statistics.nr_wakeups_passive);
+               P(se.statistics.nr_wakeups_idle);
+
                 avg_atom = p->se.sum_exec_runtime;
                 if (nr_switches)
                         avg_atom = div64_ul(avg_atom, nr_switches);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 56b7d4b839476b6ed1692e786abe9ed6cda64a5f..33130529e9b5628b53cd7031c4914da0de0e7148 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,8 +20,8 @@
   *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
   */
  
-#include <linux/latencytop.h>
  #include <linux/sched.h>
+#include <linux/latencytop.h>
  #include <linux/cpumask.h>
  #include <linux/cpuidle.h>
  #include <linux/slab.h>
@@ -755,7 +755,9 @@ static void
  update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
         struct task_struct *p;
-       u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+       u64 delta;
+
+       delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
  
         if (entity_is_task(se)) {
                 p = task_of(se);
@@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
         se->statistics.wait_sum += delta;
         se->statistics.wait_start = 0;
  }
-#else
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-#endif
  
  /*
   * Task is being enqueued - update stats:
   */
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
         /*
          * Are we enqueueing a waiting task? (for current tasks
@@ -802,7 +794,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  }
  
  static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
         /*
          * Mark the end of the wait period if dequeueing a
@@ -810,8 +802,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
          */
         if (se != cfs_rq->curr)
                 update_stats_wait_end(cfs_rq, se);
+
+       if (flags & DEQUEUE_SLEEP) {
+               if (entity_is_task(se)) {
+                       struct task_struct *tsk = task_of(se);
+
+                       if (tsk->state & TASK_INTERRUPTIBLE)
+                               se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
+                       if (tsk->state & TASK_UNINTERRUPTIBLE)
+                               se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+               }
+       }
+
+}
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
  }
  
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+}
+#endif
+
  /*
   * We are picking a new current task - update its stats:
   */
@@ -907,10 +932,11 @@ struct numa_group {
         spinlock_t lock; /* nr_tasks, tasks */
         int nr_tasks;
         pid_t gid;
+       int active_nodes;
  
         struct rcu_head rcu;
-       nodemask_t active_nodes;
         unsigned long total_faults;
+       unsigned long max_faults_cpu;
         /*
          * Faults_cpu is used to decide whether memory should move
          * towards the CPU. As a consequence, these stats are weighted
@@ -969,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
                 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
  }
  
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+       return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
  /* Handle placement on systems where not all nodes are directly connected. */
  static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
                                         int maxdist, bool task)
@@ -1118,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
                 return true;
  
         /*
-        * Do not migrate if the destination is not a node that
-        * is actively used by this numa group.
+        * Destination node is much more heavily used than the source
+        * node? Allow migration.
          */
-       if (!node_isset(dst_nid, ng->active_nodes))
-               return false;
-
-       /*
-        * Source is a node that is not actively used by this
-        * numa group, while the destination is. Migrate.
-        */
-       if (!node_isset(src_nid, ng->active_nodes))
+       if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+                                       ACTIVE_NODE_FRACTION)
                 return true;
  
         /*
-        * Both source and destination are nodes in active
-        * use by this numa group. Maximize memory bandwidth
-        * by migrating from more heavily used groups, to less
-        * heavily used ones, spreading the load around.
-        * Use a 1/4 hysteresis to avoid spurious page movement.
+        * Distribute memory according to CPU & memory use on each node,
+        * with 3/4 hysteresis to avoid unnecessary memory migrations:
+        *
+        * faults_cpu(dst)   3   faults_cpu(src)
+        * --------------- * - > ---------------
+        * faults_mem(dst)   4   faults_mem(src)
          */
-       return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+       return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+              group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
  }
  
  static unsigned long weighted_cpuload(const int cpu);
@@ -1484,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)
  
                 .best_task = NULL,
                 .best_imp = 0,
-               .best_cpu = -1
+               .best_cpu = -1,
         };
         struct sched_domain *sd;
         unsigned long taskweight, groupweight;
@@ -1536,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)
          *   multiple NUMA nodes; in order to better consolidate the group,
          *   we need to check other locations.
          */
-       if (env.best_cpu == -1 || (p->numa_group &&
-                       nodes_weight(p->numa_group->active_nodes) > 1)) {
+       if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
                 for_each_online_node(nid) {
                         if (nid == env.src_nid || nid == p->numa_preferred_nid)
                                 continue;
@@ -1572,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)
          * trying for a better one later. Do not set the preferred node here.
          */
         if (p->numa_group) {
+               struct numa_group *ng = p->numa_group;
+
                 if (env.best_cpu == -1)
                         nid = env.src_nid;
                 else
                         nid = env.dst_nid;
  
-               if (node_isset(nid, p->numa_group->active_nodes))
+               if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
                         sched_setnuma(p, env.dst_nid);
         }
  
@@ -1627,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)
  }
  
  /*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes on the workload is actively running on. Do this by
   * tracking the nodes from which NUMA hinting faults are triggered. This can
   * be different from the set of nodes where the workload's memory is currently
   * located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations, To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
   */
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
  {
         unsigned long faults, max_faults = 0;
-       int nid;
+       int nid, active_nodes = 0;
  
         for_each_online_node(nid) {
                 faults = group_faults_cpu(numa_group, nid);
@@ -1650,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
  
         for_each_online_node(nid) {
                 faults = group_faults_cpu(numa_group, nid);
-               if (!node_isset(nid, numa_group->active_nodes)) {
-                       if (faults > max_faults * 6 / 16)
-                               node_set(nid, numa_group->active_nodes);
-               } else if (faults < max_faults * 3 / 16)
-                       node_clear(nid, numa_group->active_nodes);
+               if (faults * ACTIVE_NODE_FRACTION > max_faults)
+                       active_nodes++;
         }
+
+       numa_group->max_faults_cpu = max_faults;
+       numa_group->active_nodes = active_nodes;
  }
  
  /*
@@ -1946,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)
         update_task_scan_period(p, fault_types[0], fault_types[1]);
  
         if (p->numa_group) {
-               update_numa_active_node_mask(p->numa_group);
+               numa_group_count_active_nodes(p->numa_group);
                 spin_unlock_irq(group_lock);
                 max_nid = preferred_group_nid(p, max_group_nid);
         }
@@ -1990,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
                         return;
  
                 atomic_set(&grp->refcount, 1);
+               grp->active_nodes = 1;
+               grp->max_faults_cpu = 0;
                 spin_lock_init(&grp->lock);
                 grp->gid = p->pid;
                 /* Second half of the array tracks nids where faults happen */
                 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
                                                 nr_node_ids;
  
-               node_set(task_node(current), grp->active_nodes);
-
                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
                         grp->faults[i] = p->numa_faults[i];
  
@@ -2111,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
         bool migrated = flags & TNF_MIGRATED;
         int cpu_node = task_node(current);
         int local = !!(flags & TNF_FAULT_LOCAL);
+       struct numa_group *ng;
         int priv;
  
         if (!static_branch_likely(&sched_numa_balancing))
@@ -2151,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
          * actively using should be counted as local. This allows the
          * scan rate to slow down when a workload has settled down.
          */
-       if (!priv && !local && p->numa_group &&
-                       node_isset(cpu_node, p->numa_group->active_nodes) &&
-                       node_isset(mem_node, p->numa_group->active_nodes))
+       ng = p->numa_group;
+       if (!priv && !local && ng && ng->active_nodes > 1 &&
+                               numa_is_active_node(cpu_node, ng) &&
+                               numa_is_active_node(mem_node, ng))
                 local = 1;
  
         task_numa_placement(p);
@@ -3102,6 +3134,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
  
  static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
  
+static inline void check_schedstat_required(void)
+{
+#ifdef CONFIG_SCHEDSTATS
+       if (schedstat_enabled())
+               return;
+
+       /* Force schedstat enabled if a dependent tracepoint is active */
+       if (trace_sched_stat_wait_enabled()    ||
+                       trace_sched_stat_sleep_enabled()   ||
+                       trace_sched_stat_iowait_enabled()  ||
+                       trace_sched_stat_blocked_enabled() ||
+                       trace_sched_stat_runtime_enabled())  {
+               pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+                            "stat_blocked and stat_runtime require the "
+                            "kernel parameter schedstats=enabled or "
+                            "kernel.sched_schedstats=1\n");
+       }
+#endif
+}
+
  static void
  enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
@@ -3122,11 +3174,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  
         if (flags & ENQUEUE_WAKEUP) {
                 place_entity(cfs_rq, se, 0);
-               enqueue_sleeper(cfs_rq, se);
+               if (schedstat_enabled())
+                       enqueue_sleeper(cfs_rq, se);
         }
  
-       update_stats_enqueue(cfs_rq, se);
-       check_spread(cfs_rq, se);
+       check_schedstat_required();
+       if (schedstat_enabled()) {
+               update_stats_enqueue(cfs_rq, se);
+               check_spread(cfs_rq, se);
+       }
         if (se != cfs_rq->curr)
                 __enqueue_entity(cfs_rq, se);
         se->on_rq = 1;
@@ -3193,19 +3249,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         update_curr(cfs_rq);
         dequeue_entity_load_avg(cfs_rq, se);
  
-       update_stats_dequeue(cfs_rq, se);
-       if (flags & DEQUEUE_SLEEP) {
-#ifdef CONFIG_SCHEDSTATS
-               if (entity_is_task(se)) {
-                       struct task_struct *tsk = task_of(se);
-
-                       if (tsk->state & TASK_INTERRUPTIBLE)
-                               se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
-                       if (tsk->state & TASK_UNINTERRUPTIBLE)
-                               se->statistics.block_start = rq_clock(rq_of(cfs_rq));
-               }
-#endif
-       }
+       if (schedstat_enabled())
+               update_stats_dequeue(cfs_rq, se, flags);
  
         clear_buddies(cfs_rq, se);
  
@@ -3279,7 +3324,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                  * a CPU. So account for the time it spent waiting on the
                  * runqueue.
                  */
-               update_stats_wait_end(cfs_rq, se);
+               if (schedstat_enabled())
+                       update_stats_wait_end(cfs_rq, se);
                 __dequeue_entity(cfs_rq, se);
                 update_load_avg(se, 1);
         }
@@ -3292,7 +3338,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
          * least twice that of our own weight (i.e. dont track it
          * when there are only lesser-weight tasks around):
          */
-       if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+       if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
                 se->statistics.slice_max = max(se->statistics.slice_max,
                         se->sum_exec_runtime - se->prev_sum_exec_runtime);
         }
@@ -3375,9 +3421,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
         /* throttle cfs_rqs exceeding runtime */
         check_cfs_rq_runtime(cfs_rq);
  
-       check_spread(cfs_rq, prev);
+       if (schedstat_enabled()) {
+               check_spread(cfs_rq, prev);
+               if (prev->on_rq)
+                       update_stats_wait_start(cfs_rq, prev);
+       }
+
         if (prev->on_rq) {
-               update_stats_wait_start(cfs_rq, prev);
                 /* Put 'current' back into the tree. */
                 __enqueue_entity(cfs_rq, prev);
                 /* in !on_rq case, update occurred at dequeue */
@@ -4459,9 +4509,17 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
  
                 /* scale is effectively 1 << i now, and >> i divides by scale */
  
-               old_load = this_rq->cpu_load[i] - tickless_load;
+               old_load = this_rq->cpu_load[i];
                 old_load = decay_load_missed(old_load, pending_updates - 1, i);
-               old_load += tickless_load;
+               if (tickless_load) {
+                       old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
+                       /*
+                        * old_load can never be a negative value because a
+                        * decayed tickless_load cannot be greater than the
+                        * original tickless_load.
+                        */
+                       old_load += tickless_load;
+               }
                 new_load = this_load;
                 /*
                  * Round up the averaging division if load is increasing. This
@@ -4484,6 +4542,25 @@ static unsigned long weighted_cpuload(const int cpu)
  }
  
  #ifdef CONFIG_NO_HZ_COMMON
+static void __update_cpu_load_nohz(struct rq *this_rq,
+                                  unsigned long curr_jiffies,
+                                  unsigned long load,
+                                  int active)
+{
+       unsigned long pending_updates;
+
+       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+       if (pending_updates) {
+               this_rq->last_load_update_tick = curr_jiffies;
+               /*
+                * In the regular NOHZ case, we were idle, this means load 0.
+                * In the NOHZ_FULL case, we were non-idle, we should consider
+                * its weighted load.
+                */
+               __update_cpu_load(this_rq, load, pending_updates, active);
+       }
+}
+
  /*
   * There is no sane way to deal with nohz on smp when using jiffies because the
   * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -4501,22 +4578,15 @@ static unsigned long weighted_cpuload(const int cpu)
   * Called from nohz_idle_balance() to update the load ratings before doing the
   * idle balance.
   */
-static void update_idle_cpu_load(struct rq *this_rq)
+static void update_cpu_load_idle(struct rq *this_rq)
  {
-       unsigned long curr_jiffies = READ_ONCE(jiffies);
-       unsigned long load = weighted_cpuload(cpu_of(this_rq));
-       unsigned long pending_updates;
-
         /*
          * bail if there's load or we're actually up-to-date.
          */
-       if (load || curr_jiffies == this_rq->last_load_update_tick)
+       if (weighted_cpuload(cpu_of(this_rq)))
                 return;
  
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       this_rq->last_load_update_tick = curr_jiffies;
-
-       __update_cpu_load(this_rq, load, pending_updates, 0);
+       __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
  }
  
  /*
@@ -4527,22 +4597,12 @@ void update_cpu_load_nohz(int active)
         struct rq *this_rq = this_rq();
         unsigned long curr_jiffies = READ_ONCE(jiffies);
         unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
-       unsigned long pending_updates;
  
         if (curr_jiffies == this_rq->last_load_update_tick)
                 return;
  
         raw_spin_lock(&this_rq->lock);
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       if (pending_updates) {
-               this_rq->last_load_update_tick = curr_jiffies;
-               /*
-                * In the regular NOHZ case, we were idle, this means load 0.
-                * In the NOHZ_FULL case, we were non-idle, we should consider
-                * its weighted load.
-                */
-               __update_cpu_load(this_rq, load, pending_updates, active);
-       }
+       __update_cpu_load_nohz(this_rq, curr_jiffies, load, active);
         raw_spin_unlock(&this_rq->lock);
  }
  #endif /* CONFIG_NO_HZ */
@@ -4554,7 +4614,7 @@ void update_cpu_load_active(struct rq *this_rq)
  {
         unsigned long load = weighted_cpuload(cpu_of(this_rq));
         /*
-        * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+        * See the mess around update_cpu_load_idle() / update_cpu_load_nohz().
          */
         this_rq->last_load_update_tick = jiffies;
         __update_cpu_load(this_rq, load, 1, 1);
@@ -7848,7 +7908,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
                 if (time_after_eq(jiffies, rq->next_balance)) {
                         raw_spin_lock_irq(&rq->lock);
                         update_rq_clock(rq);
-                       update_idle_cpu_load(rq);
+                       update_cpu_load_idle(rq);
                         raw_spin_unlock_irq(&rq->lock);
                         rebalance_domains(rq, CPU_IDLE);
                 }
@@ -8234,11 +8294,8 @@ void free_fair_sched_group(struct task_group *tg)
         for_each_possible_cpu(i) {
                 if (tg->cfs_rq)
                         kfree(tg->cfs_rq[i]);
-               if (tg->se) {
-                       if (tg->se[i])
-                               remove_entity_load_avg(tg->se[i]);
+               if (tg->se)
                         kfree(tg->se[i]);
-               }
         }
  
         kfree(tg->cfs_rq);
@@ -8286,21 +8343,29 @@ err:
         return 0;
  }
  
-void unregister_fair_sched_group(struct task_group *tg, int cpu)
+void unregister_fair_sched_group(struct task_group *tg)
  {
-       struct rq *rq = cpu_rq(cpu);
         unsigned long flags;
+       struct rq *rq;
+       int cpu;
  
-       /*
-       * Only empty task groups can be destroyed; so we can speculatively
-       * check on_list without danger of it being re-added.
-       */
-       if (!tg->cfs_rq[cpu]->on_list)
-               return;
+       for_each_possible_cpu(cpu) {
+               if (tg->se[cpu])
+                       remove_entity_load_avg(tg->se[cpu]);
  
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+               /*
+                * Only empty task groups can be destroyed; so we can speculatively
+                * check on_list without danger of it being re-added.
+                */
+               if (!tg->cfs_rq[cpu]->on_list)
+                       continue;
+
+               rq = cpu_rq(cpu);
+
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
+       }
  }
  
  void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
@@ -8382,7 +8447,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
         return 1;
  }
  
-void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
+void unregister_fair_sched_group(struct task_group *tg) { }
  
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c

index 8ec86abe0ea188369ee4e7efd21789d8cb127c14..56247132948799036fc4cbcc0c3d7bfcb778bcc5 100644 (file)
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
         raw_spin_lock(&rt_b->rt_runtime_lock);
         if (!rt_b->rt_period_active) {
                 rt_b->rt_period_active = 1;
-               hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
+               /*
+                * SCHED_DEADLINE updates the bandwidth, as a run away
+                * RT task with a DL task could hog a CPU. But DL does
+                * not reset the period. If a deadline task was running
+                * without an RT task running, it can cause RT tasks to
+                * throttle when they start up. Kick the timer right away
+                * to update the period.
+                */
+               hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
                 hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
         }
         raw_spin_unlock(&rt_b->rt_runtime_lock);
@@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
  
  static inline int on_rt_rq(struct sched_rt_entity *rt_se)
  {
-       return !list_empty(&rt_se->run_list);
+       return rt_se->on_rq;
  }
  
  #ifdef CONFIG_RT_GROUP_SCHED
@@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
         return rt_se->my_q;
  }
  
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
  
  static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
  {
@@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
                 if (!rt_se)
                         enqueue_top_rt_rq(rt_rq);
                 else if (!on_rt_rq(rt_se))
-                       enqueue_rt_entity(rt_se, false);
+                       enqueue_rt_entity(rt_se, 0);
  
                 if (rt_rq->highest_prio.curr < curr->prio)
                         resched_curr(rq);
@@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
         if (!rt_se)
                 dequeue_top_rt_rq(rt_rq);
         else if (on_rt_rq(rt_se))
-               dequeue_rt_entity(rt_se);
+               dequeue_rt_entity(rt_se, 0);
  }
  
  static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -1141,6 +1149,20 @@ unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
                 return 1;
  }
  
+static inline
+unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *group_rq = group_rt_rq(rt_se);
+       struct task_struct *tsk;
+
+       if (group_rq)
+               return group_rq->rr_nr_running;
+
+       tsk = rt_task_of(rt_se);
+
+       return (tsk->policy == SCHED_RR) ? 1 : 0;
+}
+
  static inline
  void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  {
@@ -1148,6 +1170,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  
         WARN_ON(!rt_prio(prio));
         rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
+       rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
  
         inc_rt_prio(rt_rq, prio);
         inc_rt_migration(rt_se, rt_rq);
@@ -1160,13 +1183,37 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
         WARN_ON(!rt_prio(rt_se_prio(rt_se)));
         WARN_ON(!rt_rq->rt_nr_running);
         rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
+       rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
  
         dec_rt_prio(rt_rq, rt_se_prio(rt_se));
         dec_rt_migration(rt_se, rt_rq);
         dec_rt_group(rt_se, rt_rq);
  }
  
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+/*
+ * Change rt_se->run_list location unless SAVE && !MOVE
+ *
+ * assumes ENQUEUE/DEQUEUE flags match
+ */
+static inline bool move_entity(unsigned int flags)
+{
+       if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+               return false;
+
+       return true;
+}
+
+static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
+{
+       list_del_init(&rt_se->run_list);
+
+       if (list_empty(array->queue + rt_se_prio(rt_se)))
+               __clear_bit(rt_se_prio(rt_se), array->bitmap);
+
+       rt_se->on_list = 0;
+}
+
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
  {
         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
         struct rt_prio_array *array = &rt_rq->active;
@@ -1179,26 +1226,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
          * get throttled and the current group doesn't have any other
          * active members.
          */
-       if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
+       if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
+               if (rt_se->on_list)
+                       __delist_rt_entity(rt_se, array);
                 return;
+       }
  
-       if (head)
-               list_add(&rt_se->run_list, queue);
-       else
-               list_add_tail(&rt_se->run_list, queue);
-       __set_bit(rt_se_prio(rt_se), array->bitmap);
+       if (move_entity(flags)) {
+               WARN_ON_ONCE(rt_se->on_list);
+               if (flags & ENQUEUE_HEAD)
+                       list_add(&rt_se->run_list, queue);
+               else
+                       list_add_tail(&rt_se->run_list, queue);
+
+               __set_bit(rt_se_prio(rt_se), array->bitmap);
+               rt_se->on_list = 1;
+       }
+       rt_se->on_rq = 1;
  
         inc_rt_tasks(rt_se, rt_rq);
  }
  
-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
  {
         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
         struct rt_prio_array *array = &rt_rq->active;
  
-       list_del_init(&rt_se->run_list);
-       if (list_empty(array->queue + rt_se_prio(rt_se)))
-               __clear_bit(rt_se_prio(rt_se), array->bitmap);
+       if (move_entity(flags)) {
+               WARN_ON_ONCE(!rt_se->on_list);
+               __delist_rt_entity(rt_se, array);
+       }
+       rt_se->on_rq = 0;
  
         dec_rt_tasks(rt_se, rt_rq);
  }
@@ -1207,7 +1265,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
   * Because the prio of an upper entry depends on the lower
   * entries, we must remove entries top - down.
   */
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
  {
         struct sched_rt_entity *back = NULL;
  
@@ -1220,31 +1278,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
  
         for (rt_se = back; rt_se; rt_se = rt_se->back) {
                 if (on_rt_rq(rt_se))
-                       __dequeue_rt_entity(rt_se);
+                       __dequeue_rt_entity(rt_se, flags);
         }
  }
  
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
  {
         struct rq *rq = rq_of_rt_se(rt_se);
  
-       dequeue_rt_stack(rt_se);
+       dequeue_rt_stack(rt_se, flags);
         for_each_sched_rt_entity(rt_se)
-               __enqueue_rt_entity(rt_se, head);
+               __enqueue_rt_entity(rt_se, flags);
         enqueue_top_rt_rq(&rq->rt);
  }
  
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
  {
         struct rq *rq = rq_of_rt_se(rt_se);
  
-       dequeue_rt_stack(rt_se);
+       dequeue_rt_stack(rt_se, flags);
  
         for_each_sched_rt_entity(rt_se) {
                 struct rt_rq *rt_rq = group_rt_rq(rt_se);
  
                 if (rt_rq && rt_rq->rt_nr_running)
-                       __enqueue_rt_entity(rt_se, false);
+                       __enqueue_rt_entity(rt_se, flags);
         }
         enqueue_top_rt_rq(&rq->rt);
  }
@@ -1260,7 +1318,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
         if (flags & ENQUEUE_WAKEUP)
                 rt_se->timeout = 0;
  
-       enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
+       enqueue_rt_entity(rt_se, flags);
  
         if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
                 enqueue_pushable_task(rq, p);
@@ -1271,7 +1329,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
         struct sched_rt_entity *rt_se = &p->rt;
  
         update_curr_rt(rq);
-       dequeue_rt_entity(rt_se);
+       dequeue_rt_entity(rt_se, flags);
  
         dequeue_pushable_task(rq, p);
  }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index 10f16374df7f3a3f0f3dc0eb281559cacd223311..b2ff5a2bd6df20b89bbd8ed838712ac83e905d75 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3,6 +3,7 @@
  #include <linux/sched/sysctl.h>
  #include <linux/sched/rt.h>
  #include <linux/sched/deadline.h>
+#include <linux/binfmts.h>
  #include <linux/mutex.h>
  #include <linux/spinlock.h>
  #include <linux/stop_machine.h>
@@ -313,12 +314,11 @@ extern int tg_nop(struct task_group *tg, void *data);
  
  extern void free_fair_sched_group(struct task_group *tg);
  extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
-extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
+extern void unregister_fair_sched_group(struct task_group *tg);
  extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                         struct sched_entity *se, int cpu,
                         struct sched_entity *parent);
  extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
-extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
  
  extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
  extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
@@ -450,6 +450,7 @@ static inline int rt_bandwidth_enabled(void)
  struct rt_rq {
         struct rt_prio_array active;
         unsigned int rt_nr_running;
+       unsigned int rr_nr_running;
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
         struct {
                 int curr; /* highest queued rt task prio */
@@ -909,6 +910,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
  
  extern int group_balance_cpu(struct sched_group *sg);
  
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+void register_sched_domain_sysctl(void);
+void unregister_sched_domain_sysctl(void);
+#else
+static inline void register_sched_domain_sysctl(void)
+{
+}
+static inline void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
+
  #else
  
  static inline void sched_ttwu_pending(void) { }
@@ -1022,6 +1035,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
  #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
  
  extern struct static_key_false sched_numa_balancing;
+extern struct static_key_false sched_schedstats;
  
  static inline u64 global_rt_period(void)
  {
@@ -1130,18 +1144,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
  extern const int sched_prio_to_weight[40];
  extern const u32 sched_prio_to_wmult[40];
  
+/*
+ * {de,en}queue flags:
+ *
+ * DEQUEUE_SLEEP  - task is no longer runnable
+ * ENQUEUE_WAKEUP - task just became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ *                are in a known state which allows modification. Such pairs
+ *                should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ *        in the runqueue.
+ *
+ * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_WAKING    - sched_class::task_waking was called
+ *
+ */
+
+#define DEQUEUE_SLEEP          0x01
+#define DEQUEUE_SAVE           0x02 /* matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE           0x04 /* matches ENQUEUE_MOVE */
+
  #define ENQUEUE_WAKEUP         0x01
-#define ENQUEUE_HEAD           0x02
+#define ENQUEUE_RESTORE                0x02
+#define ENQUEUE_MOVE           0x04
+
+#define ENQUEUE_HEAD           0x08
+#define ENQUEUE_REPLENISH      0x10
  #ifdef CONFIG_SMP
-#define ENQUEUE_WAKING         0x04    /* sched_class::task_waking was called */
+#define ENQUEUE_WAKING         0x20
  #else
  #define ENQUEUE_WAKING         0x00
  #endif
-#define ENQUEUE_REPLENISH      0x08
-#define ENQUEUE_RESTORE        0x10
-
-#define DEQUEUE_SLEEP          0x01
-#define DEQUEUE_SAVE           0x02
  
  #define RETRY_TASK             ((void *)-1UL)
  
@@ -1278,6 +1314,35 @@ unsigned long to_ratio(u64 period, u64 runtime);
  
  extern void init_entity_runnable_average(struct sched_entity *se);
  
+#ifdef CONFIG_NO_HZ_FULL
+extern bool sched_can_stop_tick(struct rq *rq);
+
+/*
+ * Tick may be needed by tasks in the runqueue depending on their policy and
+ * requirements. If tick is needed, lets send the target an IPI to kick it out of
+ * nohz mode if necessary.
+ */
+static inline void sched_update_tick_dependency(struct rq *rq)
+{
+       int cpu;
+
+       if (!tick_nohz_full_enabled())
+               return;
+
+       cpu = cpu_of(rq);
+
+       if (!tick_nohz_full_cpu(cpu))
+               return;
+
+       if (sched_can_stop_tick(rq))
+               tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
+       else
+               tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+#else
+static inline void sched_update_tick_dependency(struct rq *rq) { }
+#endif
+
  static inline void add_nr_running(struct rq *rq, unsigned count)
  {
         unsigned prev_nr = rq->nr_running;
@@ -1289,26 +1354,16 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
                 if (!rq->rd->overload)
                         rq->rd->overload = true;
  #endif
-
-#ifdef CONFIG_NO_HZ_FULL
-               if (tick_nohz_full_cpu(rq->cpu)) {
-                       /*
-                        * Tick is needed if more than one task runs on a CPU.
-                        * Send the target an IPI to kick it out of nohz mode.
-                        *
-                        * We assume that IPI implies full memory barrier and the
-                        * new value of rq->nr_running is visible on reception
-                        * from the target.
-                        */
-                       tick_nohz_full_kick_cpu(rq->cpu);
-               }
-#endif
         }
+
+       sched_update_tick_dependency(rq);
  }
  
  static inline void sub_nr_running(struct rq *rq, unsigned count)
  {
         rq->nr_running -= count;
+       /* Check if we still need preemption */
+       sched_update_tick_dependency(rq);
  }
  
  static inline void rq_last_tick_reset(struct rq *rq)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h

index b0fbc7632de5f9b13d8ccd2c42d73560c347669a..70b3b6a20fb0e362f4c816fe0f069c7c8576c7f4 100644 (file)
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
         if (rq)
                 rq->rq_sched_info.run_delay += delta;
  }
-# define schedstat_inc(rq, field)      do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
-# define schedstat_set(var, val)       do { var = (val); } while (0)
+# define schedstat_enabled()           static_branch_unlikely(&sched_schedstats)
+# define schedstat_inc(rq, field)      do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
+# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
+# define schedstat_set(var, val)       do { if (schedstat_enabled()) { var = (val); } } while (0)
  #else /* !CONFIG_SCHEDSTATS */
  static inline void
  rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
  static inline void
  rq_sched_info_depart(struct rq *rq, unsigned long long delta)
  {}
+# define schedstat_enabled()           0
  # define schedstat_inc(rq, field)      do { } while (0)
  # define schedstat_add(rq, field, amt) do { } while (0)
  # define schedstat_set(var, val)       do { } while (0)
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c

new file mode 100644 (file)

index 0000000..82f0dff
--- /dev/null
+++ b/kernel/sched/swait.c
@@ -0,0 +1,123 @@
+#include <linux/sched.h>
+#include <linux/swait.h>
+
+void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
+                            struct lock_class_key *key)
+{
+       raw_spin_lock_init(&q->lock);
+       lockdep_set_class_and_name(&q->lock, key, name);
+       INIT_LIST_HEAD(&q->task_list);
+}
+EXPORT_SYMBOL(__init_swait_queue_head);
+
+/*
+ * The thing about the wake_up_state() return value; I think we can ignore it.
+ *
+ * If for some reason it would return 0, that means the previously waiting
+ * task is already running, so it will observe condition true (or has already).
+ */
+void swake_up_locked(struct swait_queue_head *q)
+{
+       struct swait_queue *curr;
+
+       if (list_empty(&q->task_list))
+               return;
+
+       curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
+       wake_up_process(curr->task);
+       list_del_init(&curr->task_list);
+}
+EXPORT_SYMBOL(swake_up_locked);
+
+void swake_up(struct swait_queue_head *q)
+{
+       unsigned long flags;
+
+       if (!swait_active(q))
+               return;
+
+       raw_spin_lock_irqsave(&q->lock, flags);
+       swake_up_locked(q);
+       raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(swake_up);
+
+/*
+ * Does not allow usage from IRQ disabled, since we must be able to
+ * release IRQs to guarantee bounded hold time.
+ */
+void swake_up_all(struct swait_queue_head *q)
+{
+       struct swait_queue *curr;
+       LIST_HEAD(tmp);
+
+       if (!swait_active(q))
+               return;
+
+       raw_spin_lock_irq(&q->lock);
+       list_splice_init(&q->task_list, &tmp);
+       while (!list_empty(&tmp)) {
+               curr = list_first_entry(&tmp, typeof(*curr), task_list);
+
+               wake_up_state(curr->task, TASK_NORMAL);
+               list_del_init(&curr->task_list);
+
+               if (list_empty(&tmp))
+                       break;
+
+               raw_spin_unlock_irq(&q->lock);
+               raw_spin_lock_irq(&q->lock);
+       }
+       raw_spin_unlock_irq(&q->lock);
+}
+EXPORT_SYMBOL(swake_up_all);
+
+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+       wait->task = current;
+       if (list_empty(&wait->task_list))
+               list_add(&wait->task_list, &q->task_list);
+}
+
+void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&q->lock, flags);
+       __prepare_to_swait(q, wait);
+       set_current_state(state);
+       raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_swait);
+
+long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+       if (signal_pending_state(state, current))
+               return -ERESTARTSYS;
+
+       prepare_to_swait(q, wait, state);
+
+       return 0;
+}
+EXPORT_SYMBOL(prepare_to_swait_event);
+
+void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+       __set_current_state(TASK_RUNNING);
+       if (!list_empty(&wait->task_list))
+               list_del_init(&wait->task_list);
+}
+
+void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+       unsigned long flags;
+
+       __set_current_state(TASK_RUNNING);
+
+       if (!list_empty_careful(&wait->task_list)) {
+               raw_spin_lock_irqsave(&q->lock, flags);
+               list_del_init(&wait->task_list);
+               raw_spin_unlock_irqrestore(&q->lock, flags);
+       }
+}
+EXPORT_SYMBOL(finish_swait);
diff --git a/kernel/smp.c b/kernel/smp.c

index d903c02223afbaa2776b2610f00ae3def7de442e..300d29391e07416d73177c752ecca4afee21d3dd 100644 (file)
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -105,13 +105,12 @@ void __init call_function_init(void)
   * previous function call. For multi-cpu calls its even more interesting
   * as we'll have to ensure no other cpu is observing our csd.
   */
-static void csd_lock_wait(struct call_single_data *csd)
+static __always_inline void csd_lock_wait(struct call_single_data *csd)
  {
-       while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK)
-               cpu_relax();
+       smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK));
  }
  
-static void csd_lock(struct call_single_data *csd)
+static __always_inline void csd_lock(struct call_single_data *csd)
  {
         csd_lock_wait(csd);
         csd->flags |= CSD_FLAG_LOCK;
@@ -124,7 +123,7 @@ static void csd_lock(struct call_single_data *csd)
         smp_wmb();
  }
  
-static void csd_unlock(struct call_single_data *csd)
+static __always_inline void csd_unlock(struct call_single_data *csd)
  {
         WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
  
diff --git a/kernel/softirq.c b/kernel/softirq.c

index 479e4436f787646c92c42e0dbe2d940b366fbf60..8aae49dd7da862955f9ec49c65b0d80ad8effcd6 100644 (file)
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
  
         if (preempt_count() == cnt) {
  #ifdef CONFIG_DEBUG_PREEMPT
-               current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
+               current->preempt_disable_ip = get_lock_parent_ip();
  #endif
-               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+               trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
         }
  }
  EXPORT_SYMBOL(__local_bh_disable_ip);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index 97715fd9e790ade5d7cd7731107de2dafb8272e3..f5102fabef7f525f6c79d66756336c262e465caa 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = {
                 .mode           = 0644,
                 .proc_handler   = proc_dointvec,
         },
+#ifdef CONFIG_SCHEDSTATS
+       {
+               .procname       = "sched_schedstats",
+               .data           = NULL,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = sysctl_schedstats,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+#endif /* CONFIG_SCHEDSTATS */
  #endif /* CONFIG_SMP */
  #ifdef CONFIG_NUMA_BALANCING
         {
@@ -505,7 +516,7 @@ static struct ctl_table kern_table[] = {
                 .data           = &latencytop_enabled,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = sysctl_latencytop,
         },
  #endif
  #ifdef CONFIG_BLK_DEV_INITRD
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c

index f5e86d282d520a7881557de76096c1cf46f58fa7..1cafba860b08ceb6030fa2a1f237e3950a4e0aa7 100644 (file)
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -333,7 +333,6 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
         return err;
  }
  
-
  /*
   * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
   * This is called from sys_timer_create() and do_cpu_nanosleep() with the
@@ -517,6 +516,10 @@ static void arm_timer(struct k_itimer *timer)
                                 cputime_expires->sched_exp = exp;
                         break;
                 }
+               if (CPUCLOCK_PERTHREAD(timer->it_clock))
+                       tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
+               else
+                       tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);
         }
  }
  
@@ -582,39 +585,6 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
         return 0;
  }
  
-#ifdef CONFIG_NO_HZ_FULL
-static void nohz_kick_work_fn(struct work_struct *work)
-{
-       tick_nohz_full_kick_all();
-}
-
-static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
-
-/*
- * We need the IPIs to be sent from sane process context.
- * The posix cpu timers are always set with irqs disabled.
- */
-static void posix_cpu_timer_kick_nohz(void)
-{
-       if (context_tracking_is_enabled())
-               schedule_work(&nohz_kick_work);
-}
-
-bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
-{
-       if (!task_cputime_zero(&tsk->cputime_expires))
-               return false;
-
-       /* Check if cputimer is running. This is accessed without locking. */
-       if (READ_ONCE(tsk->signal->cputimer.running))
-               return false;
-
-       return true;
-}
-#else
-static inline void posix_cpu_timer_kick_nohz(void) { }
-#endif
-
  /*
   * Guts of sys_timer_settime for CPU timers.
   * This is called with the timer locked and interrupts disabled.
@@ -761,8 +731,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
                 sample_to_timespec(timer->it_clock,
                                    old_incr, &old->it_interval);
         }
-       if (!ret)
-               posix_cpu_timer_kick_nohz();
+
         return ret;
  }
  
@@ -911,6 +880,8 @@ static void check_thread_timers(struct task_struct *tsk,
                         __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
                 }
         }
+       if (task_cputime_zero(tsk_expires))
+               tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
  }
  
  static inline void stop_process_timers(struct signal_struct *sig)
@@ -919,6 +890,7 @@ static inline void stop_process_timers(struct signal_struct *sig)
  
         /* Turn off cputimer->running. This is done without locking. */
         WRITE_ONCE(cputimer->running, false);
+       tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER);
  }
  
  static u32 onecputick;
@@ -1095,8 +1067,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
         arm_timer(timer);
         unlock_task_sighand(p, &flags);
  
-       /* Kick full dynticks CPUs in case they need to tick on the new timer */
-       posix_cpu_timer_kick_nohz();
  out:
         timer->it_overrun_last = timer->it_overrun;
         timer->it_overrun = -1;
@@ -1270,7 +1240,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
                 }
  
                 if (!*newval)
-                       goto out;
+                       return;
                 *newval += now;
         }
  
@@ -1288,8 +1258,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
                         tsk->signal->cputime_expires.virt_exp = *newval;
                 break;
         }
-out:
-       posix_cpu_timer_kick_nohz();
+
+       tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER);
  }
  
  static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c

index 0b17424349eb4dafd76a2cf588748988ce51a295..969e6704c3c9a24754d0da31e0dd028257425ddd 100644 (file)
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -22,7 +22,6 @@
  #include <linux/module.h>
  #include <linux/irq_work.h>
  #include <linux/posix-timers.h>
-#include <linux/perf_event.h>
  #include <linux/context_tracking.h>
  
  #include <asm/irq_regs.h>
@@ -158,54 +157,63 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
  cpumask_var_t tick_nohz_full_mask;
  cpumask_var_t housekeeping_mask;
  bool tick_nohz_full_running;
+static unsigned long tick_dep_mask;
  
-static bool can_stop_full_tick(void)
+static void trace_tick_dependency(unsigned long dep)
+{
+       if (dep & TICK_DEP_MASK_POSIX_TIMER) {
+               trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
+               return;
+       }
+
+       if (dep & TICK_DEP_MASK_PERF_EVENTS) {
+               trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS);
+               return;
+       }
+
+       if (dep & TICK_DEP_MASK_SCHED) {
+               trace_tick_stop(0, TICK_DEP_MASK_SCHED);
+               return;
+       }
+
+       if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE)
+               trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
+}
+
+static bool can_stop_full_tick(struct tick_sched *ts)
  {
         WARN_ON_ONCE(!irqs_disabled());
  
-       if (!sched_can_stop_tick()) {
-               trace_tick_stop(0, "more than 1 task in runqueue\n");
+       if (tick_dep_mask) {
+               trace_tick_dependency(tick_dep_mask);
                 return false;
         }
  
-       if (!posix_cpu_timers_can_stop_tick(current)) {
-               trace_tick_stop(0, "posix timers running\n");
+       if (ts->tick_dep_mask) {
+               trace_tick_dependency(ts->tick_dep_mask);
                 return false;
         }
  
-       if (!perf_event_can_stop_tick()) {
-               trace_tick_stop(0, "perf events running\n");
+       if (current->tick_dep_mask) {
+               trace_tick_dependency(current->tick_dep_mask);
                 return false;
         }
  
-       /* sched_clock_tick() needs us? */
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-       /*
-        * TODO: kick full dynticks CPUs when
-        * sched_clock_stable is set.
-        */
-       if (!sched_clock_stable()) {
-               trace_tick_stop(0, "unstable sched clock\n");
-               /*
-                * Don't allow the user to think they can get
-                * full NO_HZ with this machine.
-                */
-               WARN_ONCE(tick_nohz_full_running,
-                         "NO_HZ FULL will not work with unstable sched clock");
+       if (current->signal->tick_dep_mask) {
+               trace_tick_dependency(current->signal->tick_dep_mask);
                 return false;
         }
-#endif
  
         return true;
  }
  
-static void nohz_full_kick_work_func(struct irq_work *work)
+static void nohz_full_kick_func(struct irq_work *work)
  {
         /* Empty, the tick restart happens on tick_nohz_irq_exit() */
  }
  
  static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
-       .func = nohz_full_kick_work_func,
+       .func = nohz_full_kick_func,
  };
  
  /*
@@ -214,7 +222,7 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
   * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
   * is NMI safe.
   */
-void tick_nohz_full_kick(void)
+static void tick_nohz_full_kick(void)
  {
         if (!tick_nohz_full_cpu(smp_processor_id()))
                 return;
@@ -234,27 +242,112 @@ void tick_nohz_full_kick_cpu(int cpu)
         irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
  }
  
-static void nohz_full_kick_ipi(void *info)
-{
-       /* Empty, the tick restart happens on tick_nohz_irq_exit() */
-}
-
  /*
   * Kick all full dynticks CPUs in order to force these to re-evaluate
   * their dependency on the tick and restart it if necessary.
   */
-void tick_nohz_full_kick_all(void)
+static void tick_nohz_full_kick_all(void)
  {
+       int cpu;
+
         if (!tick_nohz_full_running)
                 return;
  
         preempt_disable();
-       smp_call_function_many(tick_nohz_full_mask,
-                              nohz_full_kick_ipi, NULL, false);
-       tick_nohz_full_kick();
+       for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
+               tick_nohz_full_kick_cpu(cpu);
         preempt_enable();
  }
  
+static void tick_nohz_dep_set_all(unsigned long *dep,
+                                 enum tick_dep_bits bit)
+{
+       unsigned long prev;
+
+       prev = fetch_or(dep, BIT_MASK(bit));
+       if (!prev)
+               tick_nohz_full_kick_all();
+}
+
+/*
+ * Set a global tick dependency. Used by perf events that rely on freq and
+ * by unstable clock.
+ */
+void tick_nohz_dep_set(enum tick_dep_bits bit)
+{
+       tick_nohz_dep_set_all(&tick_dep_mask, bit);
+}
+
+void tick_nohz_dep_clear(enum tick_dep_bits bit)
+{
+       clear_bit(bit, &tick_dep_mask);
+}
+
+/*
+ * Set per-CPU tick dependency. Used by scheduler and perf events in order to
+ * manage events throttling.
+ */
+void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
+{
+       unsigned long prev;
+       struct tick_sched *ts;
+
+       ts = per_cpu_ptr(&tick_cpu_sched, cpu);
+
+       prev = fetch_or(&ts->tick_dep_mask, BIT_MASK(bit));
+       if (!prev) {
+               preempt_disable();
+               /* Perf needs local kick that is NMI safe */
+               if (cpu == smp_processor_id()) {
+                       tick_nohz_full_kick();
+               } else {
+                       /* Remote irq work not NMI-safe */
+                       if (!WARN_ON_ONCE(in_nmi()))
+                               tick_nohz_full_kick_cpu(cpu);
+               }
+               preempt_enable();
+       }
+}
+
+void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
+{
+       struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
+
+       clear_bit(bit, &ts->tick_dep_mask);
+}
+
+/*
+ * Set a per-task tick dependency. Posix CPU timers need this in order to elapse
+ * per task timers.
+ */
+void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
+{
+       /*
+        * We could optimize this with just kicking the target running the task
+        * if that noise matters for nohz full users.
+        */
+       tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
+}
+
+void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
+{
+       clear_bit(bit, &tsk->tick_dep_mask);
+}
+
+/*
+ * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
+ * per process timers.
+ */
+void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit)
+{
+       tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
+}
+
+void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
+{
+       clear_bit(bit, &sig->tick_dep_mask);
+}
+
  /*
   * Re-evaluate the need for the tick as we switch the current task.
   * It might need the tick due to per task/process properties:
@@ -263,15 +356,19 @@ void tick_nohz_full_kick_all(void)
  void __tick_nohz_task_switch(void)
  {
         unsigned long flags;
+       struct tick_sched *ts;
  
         local_irq_save(flags);
  
         if (!tick_nohz_full_cpu(smp_processor_id()))
                 goto out;
  
-       if (tick_nohz_tick_stopped() && !can_stop_full_tick())
-               tick_nohz_full_kick();
+       ts = this_cpu_ptr(&tick_cpu_sched);
  
+       if (ts->tick_stopped) {
+               if (current->tick_dep_mask || current->signal->tick_dep_mask)
+                       tick_nohz_full_kick();
+       }
  out:
         local_irq_restore(flags);
  }
@@ -689,7 +786,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
  
                 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
                 ts->tick_stopped = 1;
-               trace_tick_stop(1, " ");
+               trace_tick_stop(1, TICK_DEP_MASK_NONE);
         }
  
         /*
@@ -740,7 +837,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
         if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
                 return;
  
-       if (can_stop_full_tick())
+       if (can_stop_full_tick(ts))
                 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
         else if (ts->tick_stopped)
                 tick_nohz_restart_sched_tick(ts, ktime_get(), 1);
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h

index a4a8d4e9baa13f37cbdceff5ffedc3bfc5fd8e4c..eb4e32566a832c27afc6f388121521abab5c5d9b 100644 (file)
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -60,6 +60,7 @@ struct tick_sched {
         u64                             next_timer;
         ktime_t                         idle_expires;
         int                             do_timer_last;
+       unsigned long                   tick_dep_mask;
  };
  
  extern struct tick_sched *tick_get_tick_sched(int cpu);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c

index c9956440d0e609c51e2032f6f625c4061ba764a4..21b81a41dae572a3263c7b731efa97a2d25872e9 100644 (file)
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -30,7 +30,7 @@
  struct trace_kprobe {
         struct list_head        list;
         struct kretprobe        rp;     /* Use rp.kp for kprobe use */
-       unsigned long           nhit;
+       unsigned long __percpu *nhit;
         const char              *symbol;        /* symbol name */
         struct trace_probe      tp;
  };
@@ -274,6 +274,10 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
         if (!tk)
                 return ERR_PTR(ret);
  
+       tk->nhit = alloc_percpu(unsigned long);
+       if (!tk->nhit)
+               goto error;
+
         if (symbol) {
                 tk->symbol = kstrdup(symbol, GFP_KERNEL);
                 if (!tk->symbol)
@@ -313,6 +317,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
  error:
         kfree(tk->tp.call.name);
         kfree(tk->symbol);
+       free_percpu(tk->nhit);
         kfree(tk);
         return ERR_PTR(ret);
  }
@@ -327,6 +332,7 @@ static void free_trace_kprobe(struct trace_kprobe *tk)
         kfree(tk->tp.call.class->system);
         kfree(tk->tp.call.name);
         kfree(tk->symbol);
+       free_percpu(tk->nhit);
         kfree(tk);
  }
  
@@ -874,9 +880,14 @@ static const struct file_operations kprobe_events_ops = {
  static int probes_profile_seq_show(struct seq_file *m, void *v)
  {
         struct trace_kprobe *tk = v;
+       unsigned long nhit = 0;
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               nhit += *per_cpu_ptr(tk->nhit, cpu);
  
         seq_printf(m, "  %-44s %15lu %15lu\n",
-                  trace_event_name(&tk->tp.call), tk->nhit,
+                  trace_event_name(&tk->tp.call), nhit,
                    tk->rp.kp.nmissed);
  
         return 0;
@@ -1225,7 +1236,7 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
  {
         struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
  
-       tk->nhit++;
+       raw_cpu_inc(*tk->nhit);
  
         if (tk->tp.flags & TP_FLAG_TRACE)
                 kprobe_trace_func(tk, regs);
@@ -1242,7 +1253,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
  {
         struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
  
-       tk->nhit++;
+       raw_cpu_inc(*tk->nhit);
  
         if (tk->tp.flags & TP_FLAG_TRACE)
                 kretprobe_trace_func(tk, ri, regs);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c

index 0655afbea83f596ded953d6017e0f25a640bc073..d1663083d903aaad3ce4b359cfdb20068004b237 100644 (file)
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -186,11 +186,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
  
  extern char *__bad_type_size(void);
  
-#define SYSCALL_FIELD(type, name)                                      \
-       sizeof(type) != sizeof(trace.name) ?                            \
+#define SYSCALL_FIELD(type, field, name)                               \
+       sizeof(type) != sizeof(trace.field) ?                           \
                 __bad_type_size() :                                     \
-               #type, #name, offsetof(typeof(trace), name),            \
-               sizeof(trace.name), is_signed_type(type)
+               #type, #name, offsetof(typeof(trace), field),           \
+               sizeof(trace.field), is_signed_type(type)
  
  static int __init
  __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
@@ -261,7 +261,8 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
         int i;
         int offset = offsetof(typeof(trace), args);
  
-       ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+       ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr),
+                                FILTER_OTHER);
         if (ret)
                 return ret;
  
@@ -281,11 +282,12 @@ static int __init syscall_exit_define_fields(struct trace_event_call *call)
         struct syscall_trace_exit trace;
         int ret;
  
-       ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+       ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr),
+                                FILTER_OTHER);
         if (ret)
                 return ret;
  
-       ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
+       ret = trace_define_field(call, SYSCALL_FIELD(long, ret, ret),
                                  FILTER_OTHER);
  
         return ret;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c

index 975cb49e32bf8b1e5dc3085d27c37c312d4ce6bd..f8e26ab963ed2fc057db395bee59d6f25307b5d5 100644 (file)
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
  {
         struct mm_struct *mm;
  
-       /* convert pages-usec to Mbyte-usec */
-       stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
-       stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
+       /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */
+       stats->coremem = p->acct_rss_mem1 * PAGE_SIZE;
+       do_div(stats->coremem, 1000 * KB);
+       stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE;
+       do_div(stats->virtmem, 1000 * KB);
         mm = get_task_mm(p);
         if (mm) {
                 /* adjust to KB unit */
@@ -123,27 +125,28 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
  static void __acct_update_integrals(struct task_struct *tsk,
                                     cputime_t utime, cputime_t stime)
  {
-       if (likely(tsk->mm)) {
-               cputime_t time, dtime;
-               struct timeval value;
-               unsigned long flags;
-               u64 delta;
-
-               local_irq_save(flags);
-               time = stime + utime;
-               dtime = time - tsk->acct_timexpd;
-               jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
-               delta = value.tv_sec;
-               delta = delta * USEC_PER_SEC + value.tv_usec;
-
-               if (delta == 0)
-                       goto out;
-               tsk->acct_timexpd = time;
-               tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
-               tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
-       out:
-               local_irq_restore(flags);
-       }
+       cputime_t time, dtime;
+       u64 delta;
+
+       if (!likely(tsk->mm))
+               return;
+
+       time = stime + utime;
+       dtime = time - tsk->acct_timexpd;
+       /* Avoid division: cputime_t is often in nanoseconds already. */
+       delta = cputime_to_nsecs(dtime);
+
+       if (delta < TICK_NSEC)
+               return;
+
+       tsk->acct_timexpd = time;
+       /*
+        * Divide by 1024 to avoid overflow, and to avoid division.
+        * The final unit reported to userspace is Mbyte-usecs,
+        * the rest of the math is done in xacct_add_tsk.
+        */
+       tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
+       tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
  }
  
  /**
@@ -153,9 +156,12 @@ static void __acct_update_integrals(struct task_struct *tsk,
  void acct_update_integrals(struct task_struct *tsk)
  {
         cputime_t utime, stime;
+       unsigned long flags;
  
+       local_irq_save(flags);
         task_cputime(tsk, &utime, &stime);
         __acct_update_integrals(tsk, utime, stime);
+       local_irq_restore(flags);
  }
  
  /**
diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c

index d62de8bf022d2534182fa2bcbdf31b4eb142a8a3..1234818143204a754ec6df5ae8073107c5171dc5 100644 (file)
--- a/lib/atomic64_test.c
+++ b/lib/atomic64_test.c
@@ -17,7 +17,7 @@
  #include <linux/atomic.h>
  
  #ifdef CONFIG_X86
-#include <asm/processor.h>     /* for boot_cpu_has below */
+#include <asm/cpufeature.h>    /* for boot_cpu_has below */
  #endif
  
  #define TEST(bit, op, c_op, val)                               \
diff --git a/lib/cpumask.c b/lib/cpumask.c

index 5a70f6196f577a071ae0a31e9da7fa0e1dd1bc68..81dedaab36ccfed3150281a0f915a4070f667bfd 100644 (file)
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -41,6 +41,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
                         break;
         return i;
  }
+EXPORT_SYMBOL(cpumask_any_but);
  
  /* These are not inline because of header tangles. */
  #ifdef CONFIG_CPUMASK_OFFSTACK
diff --git a/lib/list_debug.c b/lib/list_debug.c

index 3345a089ef7b954d475d67965b8ce891c1b16ab0..3859bf63561c63936947b007fe3ee20e822509a1 100644 (file)
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -12,13 +12,6 @@
  #include <linux/kernel.h>
  #include <linux/rculist.h>
  
-static struct list_head force_poison;
-void list_force_poison(struct list_head *entry)
-{
-       entry->next = &force_poison;
-       entry->prev = &force_poison;
-}
-
  /*
   * Insert a new entry between two known consecutive entries.
   *
@@ -30,8 +23,6 @@ void __list_add(struct list_head *new,
                               struct list_head *prev,
                               struct list_head *next)
  {
-       WARN(new->next == &force_poison || new->prev == &force_poison,
-               "list_add attempted on force-poisoned entry\n");
         WARN(next->prev != prev,
                 "list_add corruption. next->prev should be "
                 "prev (%p), but was %p. (next=%p).\n",
diff --git a/lib/test_static_keys.c b/lib/test_static_keys.c

index c61b299e367ffac86450ac59c123d0925285b96c..915d75df20864d07428376750c1bd148c216a585 100644 (file)
--- a/lib/test_static_keys.c
+++ b/lib/test_static_keys.c
@@ -46,8 +46,11 @@ struct test_key {
         bool                    (*test_key)(void);
  };
  
-#define test_key_func(key, branch) \
-       ({bool func(void) { return branch(key); } func; })
+#define test_key_func(key, branch)     \
+static bool key ## _ ## branch(void)   \
+{                                      \
+       return branch(&key);            \
+}
  
  static void invert_key(struct static_key *key)
  {
@@ -92,6 +95,25 @@ static int verify_keys(struct test_key *keys, int size, bool invert)
         return 0;
  }
  
+test_key_func(old_true_key, static_key_true)
+test_key_func(old_false_key, static_key_false)
+test_key_func(true_key, static_branch_likely)
+test_key_func(true_key, static_branch_unlikely)
+test_key_func(false_key, static_branch_likely)
+test_key_func(false_key, static_branch_unlikely)
+test_key_func(base_old_true_key, static_key_true)
+test_key_func(base_inv_old_true_key, static_key_true)
+test_key_func(base_old_false_key, static_key_false)
+test_key_func(base_inv_old_false_key, static_key_false)
+test_key_func(base_true_key, static_branch_likely)
+test_key_func(base_true_key, static_branch_unlikely)
+test_key_func(base_inv_true_key, static_branch_likely)
+test_key_func(base_inv_true_key, static_branch_unlikely)
+test_key_func(base_false_key, static_branch_likely)
+test_key_func(base_false_key, static_branch_unlikely)
+test_key_func(base_inv_false_key, static_branch_likely)
+test_key_func(base_inv_false_key, static_branch_unlikely)
+
  static int __init test_static_key_init(void)
  {
         int ret;
@@ -102,95 +124,95 @@ static int __init test_static_key_init(void)
                 {
                         .init_state     = true,
                         .key            = &old_true_key,
-                       .test_key       = test_key_func(&old_true_key, static_key_true),
+                       .test_key       = &old_true_key_static_key_true,
                 },
                 {
                         .init_state     = false,
                         .key            = &old_false_key,
-                       .test_key       = test_key_func(&old_false_key, static_key_false),
+                       .test_key       = &old_false_key_static_key_false,
                 },
                 /* internal keys - new keys */
                 {
                         .init_state     = true,
                         .key            = &true_key.key,
-                       .test_key       = test_key_func(&true_key, static_branch_likely),
+                       .test_key       = &true_key_static_branch_likely,
                 },
                 {
                         .init_state     = true,
                         .key            = &true_key.key,
-                       .test_key       = test_key_func(&true_key, static_branch_unlikely),
+                       .test_key       = &true_key_static_branch_unlikely,
                 },
                 {
                         .init_state     = false,
                         .key            = &false_key.key,
-                       .test_key       = test_key_func(&false_key, static_branch_likely),
+                       .test_key       = &false_key_static_branch_likely,
                 },
                 {
                         .init_state     = false,
                         .key            = &false_key.key,
-                       .test_key       = test_key_func(&false_key, static_branch_unlikely),
+                       .test_key       = &false_key_static_branch_unlikely,
                 },
                 /* external keys - old keys */
                 {
                         .init_state     = true,
                         .key            = &base_old_true_key,
-                       .test_key       = test_key_func(&base_old_true_key, static_key_true),
+                       .test_key       = &base_old_true_key_static_key_true,
                 },
                 {
                         .init_state     = false,
                         .key            = &base_inv_old_true_key,
-                       .test_key       = test_key_func(&base_inv_old_true_key, static_key_true),
+                       .test_key       = &base_inv_old_true_key_static_key_true,
                 },
                 {
                         .init_state     = false,
                         .key            = &base_old_false_key,
-                       .test_key       = test_key_func(&base_old_false_key, static_key_false),
+                       .test_key       = &base_old_false_key_static_key_false,
                 },
                 {
                         .init_state     = true,
                         .key            = &base_inv_old_false_key,
-                       .test_key       = test_key_func(&base_inv_old_false_key, static_key_false),
+                       .test_key       = &base_inv_old_false_key_static_key_false,
                 },
                 /* external keys - new keys */
                 {
                         .init_state     = true,
                         .key            = &base_true_key.key,
-                       .test_key       = test_key_func(&base_true_key, static_branch_likely),
+                       .test_key       = &base_true_key_static_branch_likely,
                 },
                 {
                         .init_state     = true,
                         .key            = &base_true_key.key,
-                       .test_key       = test_key_func(&base_true_key, static_branch_unlikely),
+                       .test_key       = &base_true_key_static_branch_unlikely,
                 },
                 {
                         .init_state     = false,
                         .key            = &base_inv_true_key.key,
-                       .test_key       = test_key_func(&base_inv_true_key, static_branch_likely),
+                       .test_key       = &base_inv_true_key_static_branch_likely,
                 },
                 {
                         .init_state     = false,
                         .key            = &base_inv_true_key.key,
-                       .test_key       = test_key_func(&base_inv_true_key, static_branch_unlikely),
+                       .test_key       = &base_inv_true_key_static_branch_unlikely,
                 },
                 {
                         .init_state     = false,
                         .key            = &base_false_key.key,
-                       .test_key       = test_key_func(&base_false_key, static_branch_likely),
+                       .test_key       = &base_false_key_static_branch_likely,
                 },
                 {
                         .init_state     = false,
                         .key            = &base_false_key.key,
-                       .test_key       = test_key_func(&base_false_key, static_branch_unlikely),
+                       .test_key       = &base_false_key_static_branch_unlikely,
                 },
                 {
                         .init_state     = true,
                         .key            = &base_inv_false_key.key,
-                       .test_key       = test_key_func(&base_inv_false_key, static_branch_likely),
+                       .test_key       = &base_inv_false_key_static_branch_likely,
                 },
                 {
                         .init_state     = true,
                         .key            = &base_inv_false_key.key,
-                       .test_key       = test_key_func(&base_inv_false_key, static_branch_unlikely),
+                       .test_key       = &base_inv_false_key_static_branch_unlikely,
                 },
         };
  
diff --git a/mm/filemap.c b/mm/filemap.c

index 3461d97ecb30bfe18d1ed50729e5147aa2ad7e6d..da7a35d83de7edc4ed562caa5e1925194f2c7229 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -195,6 +195,30 @@ void __delete_from_page_cache(struct page *page, void *shadow,
         else
                 cleancache_invalidate_page(mapping, page);
  
+       VM_BUG_ON_PAGE(page_mapped(page), page);
+       if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
+               int mapcount;
+
+               pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
+                        current->comm, page_to_pfn(page));
+               dump_page(page, "still mapped when deleted");
+               dump_stack();
+               add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+               mapcount = page_mapcount(page);
+               if (mapping_exiting(mapping) &&
+                   page_count(page) >= mapcount + 2) {
+                       /*
+                        * All vmas have already been torn down, so it's
+                        * a good bet that actually the page is unmapped,
+                        * and we'd prefer not to leak it: if we're wrong,
+                        * some other bad page check should catch it later.
+                        */
+                       page_mapcount_reset(page);
+                       atomic_sub(mapcount, &page->_count);
+               }
+       }
+
         page_cache_tree_delete(mapping, page, shadow);
  
         page->mapping = NULL;
@@ -205,7 +229,6 @@ void __delete_from_page_cache(struct page *page, void *shadow,
                 __dec_zone_page_state(page, NR_FILE_PAGES);
         if (PageSwapBacked(page))
                 __dec_zone_page_state(page, NR_SHMEM);
-       VM_BUG_ON_PAGE(page_mapped(page), page);
  
         /*
          * At this point page must be either written or cleaned by truncate.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index 01f2b48c8618a9f973eeb11f2162b75bb8cf67d4..aefba5a9cc47f74880dbbb3491a8927fcf999b06 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2751,7 +2751,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
         int ret;
  
         if (!hugepages_supported())
-               return -ENOTSUPP;
+               return -EOPNOTSUPP;
  
         table->data = &tmp;
         table->maxlen = sizeof(unsigned long);
@@ -2792,7 +2792,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
         int ret;
  
         if (!hugepages_supported())
-               return -ENOTSUPP;
+               return -EOPNOTSUPP;
  
         tmp = h->nr_overcommit_huge_pages;
  
@@ -3502,7 +3502,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
          * COW. Warn that such a situation has occurred as it may not be obvious
          */
         if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
-               pr_warning("PID %d killed due to inadequate hugepage pool\n",
+               pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
                            current->pid);
                 return ret;
         }
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c

index bc0a8d8b8f42faf7bca01501425ba73e156ee293..1ad20ade8c91328d2f47e8c116f9aca6bc2fb57f 100644 (file)
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -20,6 +20,7 @@
  #include <linux/init.h>
  #include <linux/kernel.h>
  #include <linux/kmemleak.h>
+#include <linux/linkage.h>
  #include <linux/memblock.h>
  #include <linux/memory.h>
  #include <linux/mm.h>
@@ -60,6 +61,25 @@ void kasan_unpoison_shadow(const void *address, size_t size)
         }
  }
  
+static void __kasan_unpoison_stack(struct task_struct *task, void *sp)
+{
+       void *base = task_stack_page(task);
+       size_t size = sp - base;
+
+       kasan_unpoison_shadow(base, size);
+}
+
+/* Unpoison the entire stack for a task. */
+void kasan_unpoison_task_stack(struct task_struct *task)
+{
+       __kasan_unpoison_stack(task, task_stack_page(task) + THREAD_SIZE);
+}
+
+/* Unpoison the stack for the current task beyond a watermark sp value. */
+asmlinkage void kasan_unpoison_remaining_stack(void *sp)
+{
+       __kasan_unpoison_stack(current, sp);
+}
  
  /*
   * All functions below always inlined so compiler could
diff --git a/mm/memory.c b/mm/memory.c

index 8132787ae4d509d475ed6a77705076ddfee30630..906d8e3b42c000f0613638aa8a36c32ef1e62951 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1550,9 +1550,30 @@ out:
   */
  int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                         unsigned long pfn)
+{
+       return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_pfn);
+
+/**
+ * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ * @pgprot: pgprot flags for the inserted page
+ *
+ * This is exactly like vm_insert_pfn, except that it allows drivers to
+ * to override pgprot on a per-page basis.
+ *
+ * This only makes sense for IO mappings, and it makes no sense for
+ * cow mappings.  In general, using multiple vmas is preferable;
+ * vm_insert_pfn_prot should only be used if using multiple VMAs is
+ * impractical.
+ */
+int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+                       unsigned long pfn, pgprot_t pgprot)
  {
         int ret;
-       pgprot_t pgprot = vma->vm_page_prot;
         /*
          * Technically, architectures with pte_special can avoid all these
          * restrictions (same for remap_pfn_range).  However we would like
@@ -1574,7 +1595,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
  
         return ret;
  }
-EXPORT_SYMBOL(vm_insert_pfn);
+EXPORT_SYMBOL(vm_insert_pfn_prot);
  
  int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                         pfn_t pfn)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c

index 4af58a3a8ffa345c16fe190be76ba9f9e83113d4..979b18cbd343e3422d5b4ccb86d1fab2bb1090e2 100644 (file)
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -138,7 +138,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
         res->name = "System RAM";
         res->start = start;
         res->end = start + size - 1;
-       res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
         if (request_resource(&iomem_resource, res) < 0) {
                 pr_debug("System RAM resource %pR cannot be added\n", res);
                 kfree(res);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c

index 4c4187c0e1deeb25bdbf5eb383132d27feb9be90..9a3f6b90e6283b24bff091213768612b04b6d928 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -532,7 +532,7 @@ retry:
                 nid = page_to_nid(page);
                 if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
                         continue;
-               if (PageTail(page) && PageAnon(page)) {
+               if (PageTransCompound(page) && PageAnon(page)) {
                         get_page(page);
                         pte_unmap_unlock(pte, ptl);
                         lock_page(page);
diff --git a/mm/mempool.c b/mm/mempool.c

index 004d42b1dfaf928ab174e057696afa580447f3d9..7924f4f58a6d48ae5335fdedb1c2e2dd93a56d88 100644 (file)
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -135,8 +135,8 @@ static void *remove_element(mempool_t *pool)
         void *element = pool->elements[--pool->curr_nr];
  
         BUG_ON(pool->curr_nr < 0);
-       check_element(pool, element);
         kasan_unpoison_element(pool, element);
+       check_element(pool, element);
         return element;
  }
  
diff --git a/mm/mmap.c b/mm/mmap.c

index 76d1ec29149bf25f5e6ffe3e56e86467efccc641..90e3b869a8b9daed1ed426dd8d2bec0ee950c785 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3066,11 +3066,16 @@ static int special_mapping_fault(struct vm_area_struct *vma,
         pgoff_t pgoff;
         struct page **pages;
  
-       if (vma->vm_ops == &legacy_special_mapping_vmops)
+       if (vma->vm_ops == &legacy_special_mapping_vmops) {
                 pages = vma->vm_private_data;
-       else
-               pages = ((struct vm_special_mapping *)vma->vm_private_data)->
-                       pages;
+       } else {
+               struct vm_special_mapping *sm = vma->vm_private_data;
+
+               if (sm->fault)
+                       return sm->fault(sm, vma, vmf);
+
+               pages = sm->pages;
+       }
  
         for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
                 pgoff--;
diff --git a/scripts/ld-version.sh b/scripts/ld-version.sh

index d154f0877fd806a93d234e6d4288da27d6c5d09f..7bfe9fa1c8dc6db8d6c4415fbf0d26e66477e42e 100755 (executable)
--- a/scripts/ld-version.sh
+++ b/scripts/ld-version.sh
@@ -1,7 +1,7 @@
  #!/usr/bin/awk -f
  # extract linker version number from stdin and turn into single number
         {
-       gsub(".*)", "");
+       gsub(".*\\)", "");
         gsub(".*version ", "");
         gsub("-.*", "");
         split($1,a, ".");
diff --git a/scripts/sortextable.c b/scripts/sortextable.c

index c2423d913b46bd0e659ea4d4c057a3af6119c2d4..7b29fb14f870283ae4174e34643640150f7e098b 100644 (file)
--- a/scripts/sortextable.c
+++ b/scripts/sortextable.c
@@ -209,6 +209,35 @@ static int compare_relative_table(const void *a, const void *b)
         return 0;
  }
  
+static void x86_sort_relative_table(char *extab_image, int image_size)
+{
+       int i;
+
+       i = 0;
+       while (i < image_size) {
+               uint32_t *loc = (uint32_t *)(extab_image + i);
+
+               w(r(loc) + i, loc);
+               w(r(loc + 1) + i + 4, loc + 1);
+               w(r(loc + 2) + i + 8, loc + 2);
+
+               i += sizeof(uint32_t) * 3;
+       }
+
+       qsort(extab_image, image_size / 12, 12, compare_relative_table);
+
+       i = 0;
+       while (i < image_size) {
+               uint32_t *loc = (uint32_t *)(extab_image + i);
+
+               w(r(loc) - i, loc);
+               w(r(loc + 1) - (i + 4), loc + 1);
+               w(r(loc + 2) - (i + 8), loc + 2);
+
+               i += sizeof(uint32_t) * 3;
+       }
+}
+
  static void sort_relative_table(char *extab_image, int image_size)
  {
         int i;
@@ -281,6 +310,9 @@ do_file(char const *const fname)
                 break;
         case EM_386:
         case EM_X86_64:
+               custom_sort = x86_sort_relative_table;
+               break;
+
         case EM_S390:
                 custom_sort = sort_relative_table;
                 break;
diff --git a/sound/arm/pxa2xx-pcm-lib.c b/sound/arm/pxa2xx-pcm-lib.c

index e9b98af6b52c83faecccd4cc1011286a75560849..e8da3b8ee7213352426023029fcff04e87659d50 100644 (file)
--- a/sound/arm/pxa2xx-pcm-lib.c
+++ b/sound/arm/pxa2xx-pcm-lib.c
@@ -141,10 +141,8 @@ int pxa2xx_pcm_mmap(struct snd_pcm_substream *substream,
         struct vm_area_struct *vma)
  {
         struct snd_pcm_runtime *runtime = substream->runtime;
-       return dma_mmap_writecombine(substream->pcm->card->dev, vma,
-                                    runtime->dma_area,
-                                    runtime->dma_addr,
-                                    runtime->dma_bytes);
+       return dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area,
+                          runtime->dma_addr, runtime->dma_bytes);
  }
  EXPORT_SYMBOL(pxa2xx_pcm_mmap);
  
@@ -156,8 +154,7 @@ int pxa2xx_pcm_preallocate_dma_buffer(struct snd_pcm *pcm, int stream)
         buf->dev.type = SNDRV_DMA_TYPE_DEV;
         buf->dev.dev = pcm->card->dev;
         buf->private_data = NULL;
-       buf->area = dma_alloc_writecombine(pcm->card->dev, size,
-                                          &buf->addr, GFP_KERNEL);
+       buf->area = dma_alloc_wc(pcm->card->dev, size, &buf->addr, GFP_KERNEL);
         if (!buf->area)
                 return -ENOMEM;
         buf->bytes = size;
@@ -178,8 +175,7 @@ void pxa2xx_pcm_free_dma_buffers(struct snd_pcm *pcm)
                 buf = &substream->dma_buffer;
                 if (!buf->area)
                         continue;
-               dma_free_writecombine(pcm->card->dev, buf->bytes,
-                                     buf->area, buf->addr);
+               dma_free_wc(pcm->card->dev, buf->bytes, buf->area, buf->addr);
                 buf->area = NULL;
         }
  }
diff --git a/sound/soc/codecs/ab8500-codec.c b/sound/soc/codecs/ab8500-codec.c

index affb192238a403b88dc38f3e76e0c54440779ea0..faae6936bae4f55851e4f0a7062f03fdebf674ee 100644 (file)
--- a/sound/soc/codecs/ab8500-codec.c
+++ b/sound/soc/codecs/ab8500-codec.c
@@ -1130,7 +1130,7 @@ static int sid_status_control_get(struct snd_kcontrol *kcontrol,
         struct ab8500_codec_drvdata *drvdata = dev_get_drvdata(codec->dev);
  
         mutex_lock(&drvdata->ctrl_lock);
-       ucontrol->value.integer.value[0] = drvdata->sid_status;
+       ucontrol->value.enumerated.item[0] = drvdata->sid_status;
         mutex_unlock(&drvdata->ctrl_lock);
  
         return 0;
@@ -1147,7 +1147,7 @@ static int sid_status_control_put(struct snd_kcontrol *kcontrol,
  
         dev_dbg(codec->dev, "%s: Enter\n", __func__);
  
-       if (ucontrol->value.integer.value[0] != SID_APPLY_FIR) {
+       if (ucontrol->value.enumerated.item[0] != SID_APPLY_FIR) {
                 dev_err(codec->dev,
                         "%s: ERROR: This control supports '%s' only!\n",
                         __func__, enum_sid_state[SID_APPLY_FIR]);
@@ -1199,7 +1199,7 @@ static int anc_status_control_get(struct snd_kcontrol *kcontrol,
         struct ab8500_codec_drvdata *drvdata = dev_get_drvdata(codec->dev);
  
         mutex_lock(&drvdata->ctrl_lock);
-       ucontrol->value.integer.value[0] = drvdata->anc_status;
+       ucontrol->value.enumerated.item[0] = drvdata->anc_status;
         mutex_unlock(&drvdata->ctrl_lock);
  
         return 0;
@@ -1220,7 +1220,7 @@ static int anc_status_control_put(struct snd_kcontrol *kcontrol,
  
         mutex_lock(&drvdata->ctrl_lock);
  
-       req = ucontrol->value.integer.value[0];
+       req = ucontrol->value.enumerated.item[0];
         if (req >= ARRAY_SIZE(enum_anc_state)) {
                 status = -EINVAL;
                 goto cleanup;
diff --git a/sound/soc/codecs/adau17x1.h b/sound/soc/codecs/adau17x1.h

index e13583e6ff56aa5d69a163ca7a447acdaa18cbd6..5ae87a084d97e24a6aa243ba6cb1afc45290ac43 100644 (file)
--- a/sound/soc/codecs/adau17x1.h
+++ b/sound/soc/codecs/adau17x1.h
@@ -103,9 +103,9 @@ bool adau17x1_has_dsp(struct adau *adau);
  #define ADAU17X1_CLOCK_CONTROL_CORECLK_SRC_PLL BIT(3)
  #define ADAU17X1_CLOCK_CONTROL_SYSCLK_EN       BIT(0)
  
-#define ADAU17X1_SERIAL_PORT1_BCLK32           (0x0 << 5)
-#define ADAU17X1_SERIAL_PORT1_BCLK48           (0x1 << 5)
-#define ADAU17X1_SERIAL_PORT1_BCLK64           (0x2 << 5)
+#define ADAU17X1_SERIAL_PORT1_BCLK64           (0x0 << 5)
+#define ADAU17X1_SERIAL_PORT1_BCLK32           (0x1 << 5)
+#define ADAU17X1_SERIAL_PORT1_BCLK48           (0x2 << 5)
  #define ADAU17X1_SERIAL_PORT1_BCLK128          (0x3 << 5)
  #define ADAU17X1_SERIAL_PORT1_BCLK256          (0x4 << 5)
  #define ADAU17X1_SERIAL_PORT1_BCLK_MASK                (0x7 << 5)
diff --git a/sound/soc/codecs/cs42l51.c b/sound/soc/codecs/cs42l51.c

index b3951524339f90cdf8f90dbc23cbadd46a874af9..35488f14e2378ada271328a97504a9558b746836 100644 (file)
--- a/sound/soc/codecs/cs42l51.c
+++ b/sound/soc/codecs/cs42l51.c
@@ -60,15 +60,15 @@ static int cs42l51_get_chan_mix(struct snd_kcontrol *kcontrol,
         switch (value) {
         default:
         case 0:
-               ucontrol->value.integer.value[0] = 0;
+               ucontrol->value.enumerated.item[0] = 0;
                 break;
         /* same value : (L+R)/2 and (R+L)/2 */
         case 1:
         case 2:
-               ucontrol->value.integer.value[0] = 1;
+               ucontrol->value.enumerated.item[0] = 1;
                 break;
         case 3:
-               ucontrol->value.integer.value[0] = 2;
+               ucontrol->value.enumerated.item[0] = 2;
                 break;
         }
  
@@ -85,7 +85,7 @@ static int cs42l51_set_chan_mix(struct snd_kcontrol *kcontrol,
         struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
         unsigned char val;
  
-       switch (ucontrol->value.integer.value[0]) {
+       switch (ucontrol->value.enumerated.item[0]) {
         default:
         case 0:
                 val = CHAN_MIX_NORMAL;
diff --git a/sound/soc/codecs/da732x.c b/sound/soc/codecs/da732x.c

index 1d5a89c5164b85686f652dc8f5e13af6c1eb49e6..461506a4ca6a2ddb790b1646f27bce2d7abf25ba 100644 (file)
--- a/sound/soc/codecs/da732x.c
+++ b/sound/soc/codecs/da732x.c
@@ -334,7 +334,7 @@ static int da732x_hpf_set(struct snd_kcontrol *kcontrol,
         struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
         struct soc_enum *enum_ctrl = (struct soc_enum *)kcontrol->private_value;
         unsigned int reg = enum_ctrl->reg;
-       unsigned int sel = ucontrol->value.integer.value[0];
+       unsigned int sel = ucontrol->value.enumerated.item[0];
         unsigned int bits;
  
         switch (sel) {
@@ -368,13 +368,13 @@ static int da732x_hpf_get(struct snd_kcontrol *kcontrol,
  
         switch (val) {
         case DA732X_HPF_VOICE_EN:
-               ucontrol->value.integer.value[0] = DA732X_HPF_VOICE;
+               ucontrol->value.enumerated.item[0] = DA732X_HPF_VOICE;
                 break;
         case DA732X_HPF_MUSIC_EN:
-               ucontrol->value.integer.value[0] = DA732X_HPF_MUSIC;
+               ucontrol->value.enumerated.item[0] = DA732X_HPF_MUSIC;
                 break;
         default:
-               ucontrol->value.integer.value[0] = DA732X_HPF_DISABLED;
+               ucontrol->value.enumerated.item[0] = DA732X_HPF_DISABLED;
                 break;
         }
  
diff --git a/sound/soc/codecs/max98088.c b/sound/soc/codecs/max98088.c

index 20dcc496d39c9d8ac01e4d49004f92823fdd37a6..fc22804cabc5942acca6921c034fa98f234041dc 100644 (file)
--- a/sound/soc/codecs/max98088.c
+++ b/sound/soc/codecs/max98088.c
@@ -1496,7 +1496,7 @@ static int max98088_put_eq_enum(struct snd_kcontrol *kcontrol,
         struct max98088_pdata *pdata = max98088->pdata;
         int channel = max98088_get_channel(codec, kcontrol->id.name);
         struct max98088_cdata *cdata;
-       int sel = ucontrol->value.integer.value[0];
+       int sel = ucontrol->value.enumerated.item[0];
  
         if (channel < 0)
                return channel;
diff --git a/sound/soc/codecs/max98095.c b/sound/soc/codecs/max98095.c

index 1fedac50355e954790a15b84ed6aab49b2bb98e3..3577003f39cf8feaba0d97bb313e277b3b763ce7 100644 (file)
--- a/sound/soc/codecs/max98095.c
+++ b/sound/soc/codecs/max98095.c
@@ -1499,7 +1499,7 @@ static int max98095_put_eq_enum(struct snd_kcontrol *kcontrol,
         struct max98095_pdata *pdata = max98095->pdata;
         int channel = max98095_get_eq_channel(kcontrol->id.name);
         struct max98095_cdata *cdata;
-       unsigned int sel = ucontrol->value.integer.value[0];
+       unsigned int sel = ucontrol->value.enumerated.item[0];
         struct max98095_eq_cfg *coef_set;
         int fs, best, best_val, i;
         int regmask, regsave;
@@ -1653,7 +1653,7 @@ static int max98095_put_bq_enum(struct snd_kcontrol *kcontrol,
         struct max98095_pdata *pdata = max98095->pdata;
         int channel = max98095_get_bq_channel(codec, kcontrol->id.name);
         struct max98095_cdata *cdata;
-       unsigned int sel = ucontrol->value.integer.value[0];
+       unsigned int sel = ucontrol->value.enumerated.item[0];
         struct max98095_biquad_cfg *coef_set;
         int fs, best, best_val, i;
         int regmask, regsave;
diff --git a/sound/soc/codecs/tlv320dac33.c b/sound/soc/codecs/tlv320dac33.c

index 781398fb2841565f20d2f7e7a4f893d840e3e7ae..f7a6ce7e5fb12b7ab183428b28d6c69409676d08 100644 (file)
--- a/sound/soc/codecs/tlv320dac33.c
+++ b/sound/soc/codecs/tlv320dac33.c
@@ -446,7 +446,7 @@ static int dac33_get_fifo_mode(struct snd_kcontrol *kcontrol,
         struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
         struct tlv320dac33_priv *dac33 = snd_soc_codec_get_drvdata(codec);
  
-       ucontrol->value.integer.value[0] = dac33->fifo_mode;
+       ucontrol->value.enumerated.item[0] = dac33->fifo_mode;
  
         return 0;
  }
@@ -458,17 +458,16 @@ static int dac33_set_fifo_mode(struct snd_kcontrol *kcontrol,
         struct tlv320dac33_priv *dac33 = snd_soc_codec_get_drvdata(codec);
         int ret = 0;
  
-       if (dac33->fifo_mode == ucontrol->value.integer.value[0])
+       if (dac33->fifo_mode == ucontrol->value.enumerated.item[0])
                 return 0;
         /* Do not allow changes while stream is running*/
         if (snd_soc_codec_is_active(codec))
                 return -EPERM;
  
-       if (ucontrol->value.integer.value[0] < 0 ||
-           ucontrol->value.integer.value[0] >= DAC33_FIFO_LAST_MODE)
+       if (ucontrol->value.enumerated.item[0] >= DAC33_FIFO_LAST_MODE)
                 ret = -EINVAL;
         else
-               dac33->fifo_mode = ucontrol->value.integer.value[0];
+               dac33->fifo_mode = ucontrol->value.enumerated.item[0];
  
         return ret;
  }
diff --git a/sound/soc/codecs/wl1273.c b/sound/soc/codecs/wl1273.c

index 7693c1129babf0e42c58b5ca93832052374cb426..1b79778098d29ffbb377efc24576825cdfe8038c 100644 (file)
--- a/sound/soc/codecs/wl1273.c
+++ b/sound/soc/codecs/wl1273.c
@@ -175,7 +175,7 @@ static int snd_wl1273_get_audio_route(struct snd_kcontrol *kcontrol,
         struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
         struct wl1273_priv *wl1273 = snd_soc_codec_get_drvdata(codec);
  
-       ucontrol->value.integer.value[0] = wl1273->mode;
+       ucontrol->value.enumerated.item[0] = wl1273->mode;
  
         return 0;
  }
@@ -193,18 +193,17 @@ static int snd_wl1273_set_audio_route(struct snd_kcontrol *kcontrol,
         struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
         struct wl1273_priv *wl1273 = snd_soc_codec_get_drvdata(codec);
  
-       if (wl1273->mode == ucontrol->value.integer.value[0])
+       if (wl1273->mode == ucontrol->value.enumerated.item[0])
                 return 0;
  
         /* Do not allow changes while stream is running */
         if (snd_soc_codec_is_active(codec))
                 return -EPERM;
  
-       if (ucontrol->value.integer.value[0] < 0 ||
-           ucontrol->value.integer.value[0] >=  ARRAY_SIZE(wl1273_audio_route))
+       if (ucontrol->value.enumerated.item[0] >=  ARRAY_SIZE(wl1273_audio_route))
                 return -EINVAL;
  
-       wl1273->mode = ucontrol->value.integer.value[0];
+       wl1273->mode = ucontrol->value.enumerated.item[0];
  
         return 1;
  }
@@ -219,7 +218,7 @@ static int snd_wl1273_fm_audio_get(struct snd_kcontrol *kcontrol,
  
         dev_dbg(codec->dev, "%s: enter.\n", __func__);
  
-       ucontrol->value.integer.value[0] = wl1273->core->audio_mode;
+       ucontrol->value.enumerated.item[0] = wl1273->core->audio_mode;
  
         return 0;
  }
@@ -233,7 +232,7 @@ static int snd_wl1273_fm_audio_put(struct snd_kcontrol *kcontrol,
  
         dev_dbg(codec->dev, "%s: enter.\n", __func__);
  
-       val = ucontrol->value.integer.value[0];
+       val = ucontrol->value.enumerated.item[0];
         if (wl1273->core->audio_mode == val)
                 return 0;
  
diff --git a/sound/soc/codecs/wm8753.c b/sound/soc/codecs/wm8753.c

index 61299ca372ffc1fc3f83dd88fe823151da7b09b8..6f1024f48b193bc7d3127cc8b12d56d14124413b 100644 (file)
--- a/sound/soc/codecs/wm8753.c
+++ b/sound/soc/codecs/wm8753.c
@@ -233,7 +233,7 @@ static int wm8753_get_dai(struct snd_kcontrol *kcontrol,
         struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
         struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec);
  
-       ucontrol->value.integer.value[0] = wm8753->dai_func;
+       ucontrol->value.enumerated.item[0] = wm8753->dai_func;
         return 0;
  }
  
@@ -244,7 +244,7 @@ static int wm8753_set_dai(struct snd_kcontrol *kcontrol,
         struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec);
         u16 ioctl;
  
-       if (wm8753->dai_func == ucontrol->value.integer.value[0])
+       if (wm8753->dai_func == ucontrol->value.enumerated.item[0])
                 return 0;
  
         if (snd_soc_codec_is_active(codec))
@@ -252,7 +252,7 @@ static int wm8753_set_dai(struct snd_kcontrol *kcontrol,
  
         ioctl = snd_soc_read(codec, WM8753_IOCTL);
  
-       wm8753->dai_func = ucontrol->value.integer.value[0];
+       wm8753->dai_func = ucontrol->value.enumerated.item[0];
  
         if (((ioctl >> 2) & 0x3) == wm8753->dai_func)
                 return 1;
diff --git a/sound/soc/codecs/wm8904.c b/sound/soc/codecs/wm8904.c

index 8172e499e6ed18f33e6f94b2bec1a1647d1c1788..edd7a77091942cd7b873e9c193baed54dd6186c6 100644 (file)
--- a/sound/soc/codecs/wm8904.c
+++ b/sound/soc/codecs/wm8904.c
@@ -396,7 +396,7 @@ static int wm8904_put_drc_enum(struct snd_kcontrol *kcontrol,
         struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
         struct wm8904_priv *wm8904 = snd_soc_codec_get_drvdata(codec);
         struct wm8904_pdata *pdata = wm8904->pdata;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
  
         if (value >= pdata->num_drc_cfgs)
                 return -EINVAL;
@@ -467,7 +467,7 @@ static int wm8904_put_retune_mobile_enum(struct snd_kcontrol *kcontrol,
         struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
         struct wm8904_priv *wm8904 = snd_soc_codec_get_drvdata(codec);
         struct wm8904_pdata *pdata = wm8904->pdata;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
  
         if (value >= pdata->num_retune_mobile_cfgs)
                 return -EINVAL;
diff --git a/sound/soc/codecs/wm8958-dsp2.c b/sound/soc/codecs/wm8958-dsp2.c

index c799cca5abeb704645687b863ccedfc6749e9ac2..6b864c0fc2b676cc83f9d03d2218815e7e805919 100644 (file)
--- a/sound/soc/codecs/wm8958-dsp2.c
+++ b/sound/soc/codecs/wm8958-dsp2.c
@@ -459,7 +459,7 @@ static int wm8958_put_mbc_enum(struct snd_kcontrol *kcontrol,
         struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
         struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec);
         struct wm8994 *control = wm8994->wm8994;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
         int reg;
  
         /* Don't allow on the fly reconfiguration */
@@ -549,7 +549,7 @@ static int wm8958_put_vss_enum(struct snd_kcontrol *kcontrol,
         struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
         struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec);
         struct wm8994 *control = wm8994->wm8994;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
         int reg;
  
         /* Don't allow on the fly reconfiguration */
@@ -582,7 +582,7 @@ static int wm8958_put_vss_hpf_enum(struct snd_kcontrol *kcontrol,
         struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
         struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec);
         struct wm8994 *control = wm8994->wm8994;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
         int reg;
  
         /* Don't allow on the fly reconfiguration */
@@ -749,7 +749,7 @@ static int wm8958_put_enh_eq_enum(struct snd_kcontrol *kcontrol,
         struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
         struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec);
         struct wm8994 *control = wm8994->wm8994;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
         int reg;
  
         /* Don't allow on the fly reconfiguration */
diff --git a/sound/soc/codecs/wm8983.c b/sound/soc/codecs/wm8983.c

index 7350ff654bbf37fe660e2b7a75cc5a15b5d63c68..0c002a5712cb4f261cd84454f38e24a89836519a 100644 (file)
--- a/sound/soc/codecs/wm8983.c
+++ b/sound/soc/codecs/wm8983.c
@@ -497,9 +497,9 @@ static int eqmode_get(struct snd_kcontrol *kcontrol,
  
         reg = snd_soc_read(codec, WM8983_EQ1_LOW_SHELF);
         if (reg & WM8983_EQ3DMODE)
-               ucontrol->value.integer.value[0] = 1;
+               ucontrol->value.enumerated.item[0] = 1;
         else
-               ucontrol->value.integer.value[0] = 0;
+               ucontrol->value.enumerated.item[0] = 0;
  
         return 0;
  }
@@ -511,18 +511,18 @@ static int eqmode_put(struct snd_kcontrol *kcontrol,
         unsigned int regpwr2, regpwr3;
         unsigned int reg_eq;
  
-       if (ucontrol->value.integer.value[0] != 0
-           && ucontrol->value.integer.value[0] != 1)
+       if (ucontrol->value.enumerated.item[0] != 0
+           && ucontrol->value.enumerated.item[0] != 1)
                 return -EINVAL;
  
         reg_eq = snd_soc_read(codec, WM8983_EQ1_LOW_SHELF);
         switch ((reg_eq & WM8983_EQ3DMODE) >> WM8983_EQ3DMODE_SHIFT) {
         case 0:
-               if (!ucontrol->value.integer.value[0])
+               if (!ucontrol->value.enumerated.item[0])
                         return 0;
                 break;
         case 1:
-               if (ucontrol->value.integer.value[0])
+               if (ucontrol->value.enumerated.item[0])
                         return 0;
                 break;
         }
@@ -537,7 +537,7 @@ static int eqmode_put(struct snd_kcontrol *kcontrol,
         /* set the desired eqmode */
         snd_soc_update_bits(codec, WM8983_EQ1_LOW_SHELF,
                             WM8983_EQ3DMODE_MASK,
-                           ucontrol->value.integer.value[0]
+                           ucontrol->value.enumerated.item[0]
                             << WM8983_EQ3DMODE_SHIFT);
         /* restore DAC/ADC configuration */
         snd_soc_write(codec, WM8983_POWER_MANAGEMENT_2, regpwr2);
diff --git a/sound/soc/codecs/wm8985.c b/sound/soc/codecs/wm8985.c

index 9918152a03c7518e5c0c2e299a35145abd808a14..6ac76fe116b028db34c8cdbf3691babeb2a68844 100644 (file)
--- a/sound/soc/codecs/wm8985.c
+++ b/sound/soc/codecs/wm8985.c
@@ -531,9 +531,9 @@ static int eqmode_get(struct snd_kcontrol *kcontrol,
  
         reg = snd_soc_read(codec, WM8985_EQ1_LOW_SHELF);
         if (reg & WM8985_EQ3DMODE)
-               ucontrol->value.integer.value[0] = 1;
+               ucontrol->value.enumerated.item[0] = 1;
         else
-               ucontrol->value.integer.value[0] = 0;
+               ucontrol->value.enumerated.item[0] = 0;
  
         return 0;
  }
@@ -545,18 +545,18 @@ static int eqmode_put(struct snd_kcontrol *kcontrol,
         unsigned int regpwr2, regpwr3;
         unsigned int reg_eq;
  
-       if (ucontrol->value.integer.value[0] != 0
-                       && ucontrol->value.integer.value[0] != 1)
+       if (ucontrol->value.enumerated.item[0] != 0
+                       && ucontrol->value.enumerated.item[0] != 1)
                 return -EINVAL;
  
         reg_eq = snd_soc_read(codec, WM8985_EQ1_LOW_SHELF);
         switch ((reg_eq & WM8985_EQ3DMODE) >> WM8985_EQ3DMODE_SHIFT) {
         case 0:
-               if (!ucontrol->value.integer.value[0])
+               if (!ucontrol->value.enumerated.item[0])
                         return 0;
                 break;
         case 1:
-               if (ucontrol->value.integer.value[0])
+               if (ucontrol->value.enumerated.item[0])
                         return 0;
                 break;
         }
@@ -573,7 +573,7 @@ static int eqmode_put(struct snd_kcontrol *kcontrol,
         /* set the desired eqmode */
         snd_soc_update_bits(codec, WM8985_EQ1_LOW_SHELF,
                             WM8985_EQ3DMODE_MASK,
-                           ucontrol->value.integer.value[0]
+                           ucontrol->value.enumerated.item[0]
                             << WM8985_EQ3DMODE_SHIFT);
         /* restore DAC/ADC configuration */
         snd_soc_write(codec, WM8985_POWER_MANAGEMENT_2, regpwr2);
diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c

index 2ccbb322df775b803d40fe765958f439d855d41f..a18aecb4993590e6a7bd936b238fee544462a6bc 100644 (file)
--- a/sound/soc/codecs/wm8994.c
+++ b/sound/soc/codecs/wm8994.c
@@ -362,7 +362,7 @@ static int wm8994_put_drc_enum(struct snd_kcontrol *kcontrol,
         struct wm8994 *control = wm8994->wm8994;
         struct wm8994_pdata *pdata = &control->pdata;
         int drc = wm8994_get_drc(kcontrol->id.name);
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
  
         if (drc < 0)
                 return drc;
@@ -469,7 +469,7 @@ static int wm8994_put_retune_mobile_enum(struct snd_kcontrol *kcontrol,
         struct wm8994 *control = wm8994->wm8994;
         struct wm8994_pdata *pdata = &control->pdata;
         int block = wm8994_get_retune_mobile_block(kcontrol->id.name);
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
  
         if (block < 0)
                 return block;
diff --git a/sound/soc/codecs/wm8996.c b/sound/soc/codecs/wm8996.c

index 8d7d6c01a2f7264905e7a8e29888a417f44fb515..f99b34f7647b092a657fbbc3a70fd01219837351 100644 (file)
--- a/sound/soc/codecs/wm8996.c
+++ b/sound/soc/codecs/wm8996.c
@@ -416,7 +416,7 @@ static int wm8996_put_retune_mobile_enum(struct snd_kcontrol *kcontrol,
         struct wm8996_priv *wm8996 = snd_soc_codec_get_drvdata(codec);
         struct wm8996_pdata *pdata = &wm8996->pdata;
         int block = wm8996_get_retune_mobile_block(kcontrol->id.name);
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
  
         if (block < 0)
                 return block;
diff --git a/sound/soc/codecs/wm9081.c b/sound/soc/codecs/wm9081.c

index ccb3b15139ad9dacd41ad801201471d1ed696997..363b3b6676163e8c0b110efd165c5eccd7ec9e1a 100644 (file)
--- a/sound/soc/codecs/wm9081.c
+++ b/sound/soc/codecs/wm9081.c
@@ -344,9 +344,9 @@ static int speaker_mode_get(struct snd_kcontrol *kcontrol,
  
         reg = snd_soc_read(codec, WM9081_ANALOGUE_SPEAKER_2);
         if (reg & WM9081_SPK_MODE)
-               ucontrol->value.integer.value[0] = 1;
+               ucontrol->value.enumerated.item[0] = 1;
         else
-               ucontrol->value.integer.value[0] = 0;
+               ucontrol->value.enumerated.item[0] = 0;
  
         return 0;
  }
@@ -365,7 +365,7 @@ static int speaker_mode_put(struct snd_kcontrol *kcontrol,
         unsigned int reg2 = snd_soc_read(codec, WM9081_ANALOGUE_SPEAKER_2);
  
         /* Are we changing anything? */
-       if (ucontrol->value.integer.value[0] ==
+       if (ucontrol->value.enumerated.item[0] ==
             ((reg2 & WM9081_SPK_MODE) != 0))
                 return 0;
  
@@ -373,7 +373,7 @@ static int speaker_mode_put(struct snd_kcontrol *kcontrol,
         if (reg_pwr & WM9081_SPK_ENA)
                 return -EINVAL;
  
-       if (ucontrol->value.integer.value[0]) {
+       if (ucontrol->value.enumerated.item[0]) {
                 /* Class AB */
                 reg2 &= ~(WM9081_SPK_INV_MUTE | WM9081_OUT_SPK_CTRL);
                 reg2 |= WM9081_SPK_MODE;
diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c

index 79e143625ac3d9c14ca72f96dc11acb6a970efdb..9849643ef8099c7f3340a719cf160c8ce633f9c5 100644 (file)
--- a/sound/soc/codecs/wm9713.c
+++ b/sound/soc/codecs/wm9713.c
@@ -1212,7 +1212,7 @@ static int wm9713_soc_probe(struct snd_soc_codec *codec)
         if (IS_ERR(wm9713->ac97))
                 return PTR_ERR(wm9713->ac97);
  
-       regmap = devm_regmap_init_ac97(wm9713->ac97, &wm9713_regmap_config);
+       regmap = regmap_init_ac97(wm9713->ac97, &wm9713_regmap_config);
         if (IS_ERR(regmap)) {
                 snd_soc_free_ac97_codec(wm9713->ac97);
                 return PTR_ERR(regmap);
diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c

index 33806d487b8ae00711dddd4ec400393dbff59b1f..b9195b9c2b05175278d4e779f7db87aad20d9f63 100644 (file)
--- a/sound/soc/codecs/wm_adsp.c
+++ b/sound/soc/codecs/wm_adsp.c
@@ -586,7 +586,7 @@ static int wm_adsp_fw_get(struct snd_kcontrol *kcontrol,
         struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
         struct wm_adsp *dsp = snd_soc_codec_get_drvdata(codec);
  
-       ucontrol->value.integer.value[0] = dsp[e->shift_l].fw;
+       ucontrol->value.enumerated.item[0] = dsp[e->shift_l].fw;
  
         return 0;
  }
@@ -599,10 +599,10 @@ static int wm_adsp_fw_put(struct snd_kcontrol *kcontrol,
         struct wm_adsp *dsp = snd_soc_codec_get_drvdata(codec);
         int ret = 0;
  
-       if (ucontrol->value.integer.value[0] == dsp[e->shift_l].fw)
+       if (ucontrol->value.enumerated.item[0] == dsp[e->shift_l].fw)
                 return 0;
  
-       if (ucontrol->value.integer.value[0] >= WM_ADSP_NUM_FW)
+       if (ucontrol->value.enumerated.item[0] >= WM_ADSP_NUM_FW)
                 return -EINVAL;
  
         mutex_lock(&dsp[e->shift_l].pwr_lock);
@@ -610,7 +610,7 @@ static int wm_adsp_fw_put(struct snd_kcontrol *kcontrol,
         if (dsp[e->shift_l].running || dsp[e->shift_l].compr)
                 ret = -EBUSY;
         else
-               dsp[e->shift_l].fw = ucontrol->value.integer.value[0];
+               dsp[e->shift_l].fw = ucontrol->value.enumerated.item[0];
  
         mutex_unlock(&dsp[e->shift_l].pwr_lock);
  
diff --git a/sound/soc/fsl/fsl_ssi.c b/sound/soc/fsl/fsl_ssi.c

index ed8de1035cda159d0d186f2cded0fb7a97fbceb4..40dfd8a3648408a2cc76bb6e4c4c3872c8e28ac3 100644 (file)
--- a/sound/soc/fsl/fsl_ssi.c
+++ b/sound/soc/fsl/fsl_ssi.c
@@ -112,6 +112,20 @@ struct fsl_ssi_rxtx_reg_val {
         struct fsl_ssi_reg_val tx;
  };
  
+static const struct reg_default fsl_ssi_reg_defaults[] = {
+       {CCSR_SSI_SCR,     0x00000000},
+       {CCSR_SSI_SIER,    0x00003003},
+       {CCSR_SSI_STCR,    0x00000200},
+       {CCSR_SSI_SRCR,    0x00000200},
+       {CCSR_SSI_STCCR,   0x00040000},
+       {CCSR_SSI_SRCCR,   0x00040000},
+       {CCSR_SSI_SACNT,   0x00000000},
+       {CCSR_SSI_STMSK,   0x00000000},
+       {CCSR_SSI_SRMSK,   0x00000000},
+       {CCSR_SSI_SACCEN,  0x00000000},
+       {CCSR_SSI_SACCDIS, 0x00000000},
+};
+
  static bool fsl_ssi_readable_reg(struct device *dev, unsigned int reg)
  {
         switch (reg) {
@@ -176,7 +190,8 @@ static const struct regmap_config fsl_ssi_regconfig = {
         .val_bits = 32,
         .reg_stride = 4,
         .val_format_endian = REGMAP_ENDIAN_NATIVE,
-       .num_reg_defaults_raw = CCSR_SSI_SACCDIS / sizeof(uint32_t) + 1,
+       .reg_defaults = fsl_ssi_reg_defaults,
+       .num_reg_defaults = ARRAY_SIZE(fsl_ssi_reg_defaults),
         .readable_reg = fsl_ssi_readable_reg,
         .volatile_reg = fsl_ssi_volatile_reg,
         .precious_reg = fsl_ssi_precious_reg,
@@ -186,7 +201,6 @@ static const struct regmap_config fsl_ssi_regconfig = {
  
  struct fsl_ssi_soc_data {
         bool imx;
-       bool imx21regs; /* imx21-class SSI - no SACC{ST,EN,DIS} regs */
         bool offline_config;
         u32 sisr_write_mask;
  };
@@ -289,7 +303,6 @@ static struct fsl_ssi_soc_data fsl_ssi_mpc8610 = {
  
  static struct fsl_ssi_soc_data fsl_ssi_imx21 = {
         .imx = true,
-       .imx21regs = true,
         .offline_config = true,
         .sisr_write_mask = 0,
  };
@@ -573,12 +586,8 @@ static void fsl_ssi_setup_ac97(struct fsl_ssi_private *ssi_private)
          */
         regmap_write(regs, CCSR_SSI_SACNT,
                         CCSR_SSI_SACNT_AC97EN | CCSR_SSI_SACNT_FV);
-
-       /* no SACC{ST,EN,DIS} regs on imx21-class SSI */
-       if (!ssi_private->soc->imx21regs) {
-               regmap_write(regs, CCSR_SSI_SACCDIS, 0xff);
-               regmap_write(regs, CCSR_SSI_SACCEN, 0x300);
-       }
+       regmap_write(regs, CCSR_SSI_SACCDIS, 0xff);
+       regmap_write(regs, CCSR_SSI_SACCEN, 0x300);
  
         /*
          * Enable SSI, Transmit and Receive. AC97 has to communicate with the
@@ -1388,7 +1397,6 @@ static int fsl_ssi_probe(struct platform_device *pdev)
         struct resource *res;
         void __iomem *iomem;
         char name[64];
-       struct regmap_config regconfig = fsl_ssi_regconfig;
  
         of_id = of_match_device(fsl_ssi_ids, &pdev->dev);
         if (!of_id || !of_id->data)
@@ -1436,25 +1444,15 @@ static int fsl_ssi_probe(struct platform_device *pdev)
                 return PTR_ERR(iomem);
         ssi_private->ssi_phys = res->start;
  
-       if (ssi_private->soc->imx21regs) {
-               /*
-                * According to datasheet imx21-class SSI
-                * don't have SACC{ST,EN,DIS} regs.
-                */
-               regconfig.max_register = CCSR_SSI_SRMSK;
-               regconfig.num_reg_defaults_raw =
-                       CCSR_SSI_SRMSK / sizeof(uint32_t) + 1;
-       }
-
         ret = of_property_match_string(np, "clock-names", "ipg");
         if (ret < 0) {
                 ssi_private->has_ipg_clk_name = false;
                 ssi_private->regs = devm_regmap_init_mmio(&pdev->dev, iomem,
-                       &regconfig);
+                       &fsl_ssi_regconfig);
         } else {
                 ssi_private->has_ipg_clk_name = true;
                 ssi_private->regs = devm_regmap_init_mmio_clk(&pdev->dev,
-                       "ipg", iomem, &regconfig);
+                       "ipg", iomem, &fsl_ssi_regconfig);
         }
         if (IS_ERR(ssi_private->regs)) {
                 dev_err(&pdev->dev, "Failed to init register map\n");
diff --git a/sound/soc/fsl/imx-pcm-fiq.c b/sound/soc/fsl/imx-pcm-fiq.c

index 49d7513f429e5424ee3ea081178e087368e74896..e63cd5ecfd8feacd396f835756af90107fd2e2a6 100644 (file)
--- a/sound/soc/fsl/imx-pcm-fiq.c
+++ b/sound/soc/fsl/imx-pcm-fiq.c
@@ -217,8 +217,8 @@ static int snd_imx_pcm_mmap(struct snd_pcm_substream *substream,
         struct snd_pcm_runtime *runtime = substream->runtime;
         int ret;
  
-       ret = dma_mmap_writecombine(substream->pcm->card->dev, vma,
-               runtime->dma_area, runtime->dma_addr, runtime->dma_bytes);
+       ret = dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area,
+                         runtime->dma_addr, runtime->dma_bytes);
  
         pr_debug("%s: ret: %d %p %pad 0x%08x\n", __func__, ret,
                         runtime->dma_area,
@@ -247,8 +247,7 @@ static int imx_pcm_preallocate_dma_buffer(struct snd_pcm *pcm, int stream)
         buf->dev.type = SNDRV_DMA_TYPE_DEV;
         buf->dev.dev = pcm->card->dev;
         buf->private_data = NULL;
-       buf->area = dma_alloc_writecombine(pcm->card->dev, size,
-                                          &buf->addr, GFP_KERNEL);
+       buf->area = dma_alloc_wc(pcm->card->dev, size, &buf->addr, GFP_KERNEL);
         if (!buf->area)
                 return -ENOMEM;
         buf->bytes = size;
@@ -330,8 +329,7 @@ static void imx_pcm_free(struct snd_pcm *pcm)
                 if (!buf->area)
                         continue;
  
-               dma_free_writecombine(pcm->card->dev, buf->bytes,
-                                     buf->area, buf->addr);
+               dma_free_wc(pcm->card->dev, buf->bytes, buf->area, buf->addr);
                 buf->area = NULL;
         }
  }
diff --git a/sound/soc/intel/boards/cht_bsw_rt5645.c b/sound/soc/intel/boards/cht_bsw_rt5645.c

index 2d3afddb0a2e8c83fef54b36eed42d7ff6b69878..a7b96a9a4e0ecb57876413c848cfa91359d42a92 100644 (file)
--- a/sound/soc/intel/boards/cht_bsw_rt5645.c
+++ b/sound/soc/intel/boards/cht_bsw_rt5645.c
@@ -367,8 +367,12 @@ static int snd_cht_mc_probe(struct platform_device *pdev)
         }
         card->dev = &pdev->dev;
         sprintf(codec_name, "i2c-%s:00", drv->acpi_card->codec_id);
+
         /* set correct codec name */
-       strcpy((char *)card->dai_link[2].codec_name, codec_name);
+       for (i = 0; i < ARRAY_SIZE(cht_dailink); i++)
+               if (!strcmp(card->dai_link[i].codec_name, "i2c-10EC5645:00"))
+                       card->dai_link[i].codec_name = kstrdup(codec_name, GFP_KERNEL);
+
         snd_soc_card_set_drvdata(card, drv);
         ret_val = devm_snd_soc_register_card(&pdev->dev, card);
         if (ret_val) {
diff --git a/sound/soc/intel/boards/mfld_machine.c b/sound/soc/intel/boards/mfld_machine.c

index 49c09a0add797ff95101bee1923b62254e3928a0..34f46c72a0e27cff8cd37108d27b71029cd088c6 100644 (file)
--- a/sound/soc/intel/boards/mfld_machine.c
+++ b/sound/soc/intel/boards/mfld_machine.c
@@ -94,7 +94,7 @@ static const struct soc_enum lo_enum =
  static int headset_get_switch(struct snd_kcontrol *kcontrol,
         struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = hs_switch;
+       ucontrol->value.enumerated.item[0] = hs_switch;
         return 0;
  }
  
@@ -104,12 +104,12 @@ static int headset_set_switch(struct snd_kcontrol *kcontrol,
         struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
         struct snd_soc_dapm_context *dapm = &card->dapm;
  
-       if (ucontrol->value.integer.value[0] == hs_switch)
+       if (ucontrol->value.enumerated.item[0] == hs_switch)
                 return 0;
  
         snd_soc_dapm_mutex_lock(dapm);
  
-       if (ucontrol->value.integer.value[0]) {
+       if (ucontrol->value.enumerated.item[0]) {
                 pr_debug("hs_set HS path\n");
                 snd_soc_dapm_enable_pin_unlocked(dapm, "Headphones");
                 snd_soc_dapm_disable_pin_unlocked(dapm, "EPOUT");
@@ -123,7 +123,7 @@ static int headset_set_switch(struct snd_kcontrol *kcontrol,
  
         snd_soc_dapm_mutex_unlock(dapm);
  
-       hs_switch = ucontrol->value.integer.value[0];
+       hs_switch = ucontrol->value.enumerated.item[0];
  
         return 0;
  }
@@ -148,7 +148,7 @@ static void lo_enable_out_pins(struct snd_soc_dapm_context *dapm)
  static int lo_get_switch(struct snd_kcontrol *kcontrol,
         struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = lo_dac;
+       ucontrol->value.enumerated.item[0] = lo_dac;
         return 0;
  }
  
@@ -158,7 +158,7 @@ static int lo_set_switch(struct snd_kcontrol *kcontrol,
         struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
         struct snd_soc_dapm_context *dapm = &card->dapm;
  
-       if (ucontrol->value.integer.value[0] == lo_dac)
+       if (ucontrol->value.enumerated.item[0] == lo_dac)
                 return 0;
  
         snd_soc_dapm_mutex_lock(dapm);
@@ -168,7 +168,7 @@ static int lo_set_switch(struct snd_kcontrol *kcontrol,
          */
         lo_enable_out_pins(dapm);
  
-       switch (ucontrol->value.integer.value[0]) {
+       switch (ucontrol->value.enumerated.item[0]) {
         case 0:
                 pr_debug("set vibra path\n");
                 snd_soc_dapm_disable_pin_unlocked(dapm, "VIB1OUT");
@@ -202,7 +202,7 @@ static int lo_set_switch(struct snd_kcontrol *kcontrol,
  
         snd_soc_dapm_mutex_unlock(dapm);
  
-       lo_dac = ucontrol->value.integer.value[0];
+       lo_dac = ucontrol->value.enumerated.item[0];
         return 0;
  }
  
diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c

index a294fee431f07363f965a81b4c9ef42eb3a42f58..5a4837dcfce3e615eedf47049ac3923332c8a9bf 100644 (file)
--- a/sound/soc/intel/skylake/skl-topology.c
+++ b/sound/soc/intel/skylake/skl-topology.c
@@ -978,7 +978,7 @@ static int skl_tplg_tlv_control_set(struct snd_kcontrol *kcontrol,
                                 return -EFAULT;
                 } else {
                         if (copy_from_user(ac->params,
-                                          data + 2 * sizeof(u32), size))
+                                          data + 2, size))
                                 return -EFAULT;
                 }
  
diff --git a/sound/soc/nuc900/nuc900-pcm.c b/sound/soc/nuc900/nuc900-pcm.c

index e09326158bc2dd0be61ea69f375f37dc6f0cdbe5..2cca055fd806cdb80e7226557af943533680c2f8 100644 (file)
--- a/sound/soc/nuc900/nuc900-pcm.c
+++ b/sound/soc/nuc900/nuc900-pcm.c
@@ -267,10 +267,8 @@ static int nuc900_dma_mmap(struct snd_pcm_substream *substream,
  {
         struct snd_pcm_runtime *runtime = substream->runtime;
  
-       return dma_mmap_writecombine(substream->pcm->card->dev, vma,
-                                       runtime->dma_area,
-                                       runtime->dma_addr,
-                                       runtime->dma_bytes);
+       return dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area,
+                          runtime->dma_addr, runtime->dma_bytes);
  }
  
  static struct snd_pcm_ops nuc900_dma_ops = {
diff --git a/sound/soc/omap/n810.c b/sound/soc/omap/n810.c

index 190f868e78b24af37d8442d378c06aeb8ec6aaf2..fdecb70431745bf23c7113bf7abcede1cd5306ef 100644 (file)
--- a/sound/soc/omap/n810.c
+++ b/sound/soc/omap/n810.c
@@ -133,7 +133,7 @@ static struct snd_soc_ops n810_ops = {
  static int n810_get_spk(struct snd_kcontrol *kcontrol,
                         struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = n810_spk_func;
+       ucontrol->value.enumerated.item[0] = n810_spk_func;
  
         return 0;
  }
@@ -143,10 +143,10 @@ static int n810_set_spk(struct snd_kcontrol *kcontrol,
  {
         struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
  
-       if (n810_spk_func == ucontrol->value.integer.value[0])
+       if (n810_spk_func == ucontrol->value.enumerated.item[0])
                 return 0;
  
-       n810_spk_func = ucontrol->value.integer.value[0];
+       n810_spk_func = ucontrol->value.enumerated.item[0];
         n810_ext_control(&card->dapm);
  
         return 1;
@@ -155,7 +155,7 @@ static int n810_set_spk(struct snd_kcontrol *kcontrol,
  static int n810_get_jack(struct snd_kcontrol *kcontrol,
                          struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = n810_jack_func;
+       ucontrol->value.enumerated.item[0] = n810_jack_func;
  
         return 0;
  }
@@ -165,10 +165,10 @@ static int n810_set_jack(struct snd_kcontrol *kcontrol,
  {
         struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
  
-       if (n810_jack_func == ucontrol->value.integer.value[0])
+       if (n810_jack_func == ucontrol->value.enumerated.item[0])
                 return 0;
  
-       n810_jack_func = ucontrol->value.integer.value[0];
+       n810_jack_func = ucontrol->value.enumerated.item[0];
         n810_ext_control(&card->dapm);
  
         return 1;
@@ -177,7 +177,7 @@ static int n810_set_jack(struct snd_kcontrol *kcontrol,
  static int n810_get_input(struct snd_kcontrol *kcontrol,
                           struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = n810_dmic_func;
+       ucontrol->value.enumerated.item[0] = n810_dmic_func;
  
         return 0;
  }
@@ -187,10 +187,10 @@ static int n810_set_input(struct snd_kcontrol *kcontrol,
  {
         struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
  
-       if (n810_dmic_func == ucontrol->value.integer.value[0])
+       if (n810_dmic_func == ucontrol->value.enumerated.item[0])
                 return 0;
  
-       n810_dmic_func = ucontrol->value.integer.value[0];
+       n810_dmic_func = ucontrol->value.enumerated.item[0];
         n810_ext_control(&card->dapm);
  
         return 1;
diff --git a/sound/soc/omap/omap-pcm.c b/sound/soc/omap/omap-pcm.c

index 6bb623a2a4dfcfa3cda744ca8bef18575baec2d9..99381a27295bbbd9452f0d0050c1509651373bff 100644 (file)
--- a/sound/soc/omap/omap-pcm.c
+++ b/sound/soc/omap/omap-pcm.c
@@ -156,10 +156,8 @@ static int omap_pcm_mmap(struct snd_pcm_substream *substream,
  {
         struct snd_pcm_runtime *runtime = substream->runtime;
  
-       return dma_mmap_writecombine(substream->pcm->card->dev, vma,
-                                    runtime->dma_area,
-                                    runtime->dma_addr,
-                                    runtime->dma_bytes);
+       return dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area,
+                          runtime->dma_addr, runtime->dma_bytes);
  }
  
  static struct snd_pcm_ops omap_pcm_ops = {
@@ -183,8 +181,7 @@ static int omap_pcm_preallocate_dma_buffer(struct snd_pcm *pcm,
         buf->dev.type = SNDRV_DMA_TYPE_DEV;
         buf->dev.dev = pcm->card->dev;
         buf->private_data = NULL;
-       buf->area = dma_alloc_writecombine(pcm->card->dev, size,
-                                          &buf->addr, GFP_KERNEL);
+       buf->area = dma_alloc_wc(pcm->card->dev, size, &buf->addr, GFP_KERNEL);
         if (!buf->area)
                 return -ENOMEM;
  
@@ -207,8 +204,7 @@ static void omap_pcm_free_dma_buffers(struct snd_pcm *pcm)
                 if (!buf->area)
                         continue;
  
-               dma_free_writecombine(pcm->card->dev, buf->bytes,
-                                     buf->area, buf->addr);
+               dma_free_wc(pcm->card->dev, buf->bytes, buf->area, buf->addr);
                 buf->area = NULL;
         }
  }
diff --git a/sound/soc/omap/rx51.c b/sound/soc/omap/rx51.c

index 5e21f08579d804c5f7895a22e87de6cdff8b79ef..54949242bc7075587e6a73a7235049db2e108734 100644 (file)
--- a/sound/soc/omap/rx51.c
+++ b/sound/soc/omap/rx51.c
@@ -132,7 +132,7 @@ static struct snd_soc_ops rx51_ops = {
  static int rx51_get_spk(struct snd_kcontrol *kcontrol,
                         struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = rx51_spk_func;
+       ucontrol->value.enumerated.item[0] = rx51_spk_func;
  
         return 0;
  }
@@ -142,10 +142,10 @@ static int rx51_set_spk(struct snd_kcontrol *kcontrol,
  {
         struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
  
-       if (rx51_spk_func == ucontrol->value.integer.value[0])
+       if (rx51_spk_func == ucontrol->value.enumerated.item[0])
                 return 0;
  
-       rx51_spk_func = ucontrol->value.integer.value[0];
+       rx51_spk_func = ucontrol->value.enumerated.item[0];
         rx51_ext_control(&card->dapm);
  
         return 1;
@@ -180,7 +180,7 @@ static int rx51_hp_event(struct snd_soc_dapm_widget *w,
  static int rx51_get_input(struct snd_kcontrol *kcontrol,
                           struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = rx51_dmic_func;
+       ucontrol->value.enumerated.item[0] = rx51_dmic_func;
  
         return 0;
  }
@@ -190,10 +190,10 @@ static int rx51_set_input(struct snd_kcontrol *kcontrol,
  {
         struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
  
-       if (rx51_dmic_func == ucontrol->value.integer.value[0])
+       if (rx51_dmic_func == ucontrol->value.enumerated.item[0])
                 return 0;
  
-       rx51_dmic_func = ucontrol->value.integer.value[0];
+       rx51_dmic_func = ucontrol->value.enumerated.item[0];
         rx51_ext_control(&card->dapm);
  
         return 1;
@@ -202,7 +202,7 @@ static int rx51_set_input(struct snd_kcontrol *kcontrol,
  static int rx51_get_jack(struct snd_kcontrol *kcontrol,
                          struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = rx51_jack_func;
+       ucontrol->value.enumerated.item[0] = rx51_jack_func;
  
         return 0;
  }
@@ -212,10 +212,10 @@ static int rx51_set_jack(struct snd_kcontrol *kcontrol,
  {
         struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
  
-       if (rx51_jack_func == ucontrol->value.integer.value[0])
+       if (rx51_jack_func == ucontrol->value.enumerated.item[0])
                 return 0;
  
-       rx51_jack_func = ucontrol->value.integer.value[0];
+       rx51_jack_func = ucontrol->value.enumerated.item[0];
         rx51_ext_control(&card->dapm);
  
         return 1;
diff --git a/sound/soc/pxa/corgi.c b/sound/soc/pxa/corgi.c

index c97dc13d36087628433de516a1b2217df02b492b..dcbb7aa9830c0347af6f7bd2c198639e52d9e24e 100644 (file)
--- a/sound/soc/pxa/corgi.c
+++ b/sound/soc/pxa/corgi.c
@@ -163,7 +163,7 @@ static struct snd_soc_ops corgi_ops = {
  static int corgi_get_jack(struct snd_kcontrol *kcontrol,
         struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = corgi_jack_func;
+       ucontrol->value.enumerated.item[0] = corgi_jack_func;
         return 0;
  }
  
@@ -172,10 +172,10 @@ static int corgi_set_jack(struct snd_kcontrol *kcontrol,
  {
         struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
  
-       if (corgi_jack_func == ucontrol->value.integer.value[0])
+       if (corgi_jack_func == ucontrol->value.enumerated.item[0])
                 return 0;
  
-       corgi_jack_func = ucontrol->value.integer.value[0];
+       corgi_jack_func = ucontrol->value.enumerated.item[0];
         corgi_ext_control(&card->dapm);
         return 1;
  }
@@ -183,7 +183,7 @@ static int corgi_set_jack(struct snd_kcontrol *kcontrol,
  static int corgi_get_spk(struct snd_kcontrol *kcontrol,
         struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = corgi_spk_func;
+       ucontrol->value.enumerated.item[0] = corgi_spk_func;
         return 0;
  }
  
@@ -192,10 +192,10 @@ static int corgi_set_spk(struct snd_kcontrol *kcontrol,
  {
         struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
  
-       if (corgi_spk_func == ucontrol->value.integer.value[0])
+       if (corgi_spk_func == ucontrol->value.enumerated.item[0])
                 return 0;
  
-       corgi_spk_func = ucontrol->value.integer.value[0];
+       corgi_spk_func = ucontrol->value.enumerated.item[0];
         corgi_ext_control(&card->dapm);
         return 1;
  }
diff --git a/sound/soc/pxa/magician.c b/sound/soc/pxa/magician.c

index 241d0be42d7a0c6779bb6459c5165728cbedb302..62b8377a9d2b93c81f06a8df4c0b5ed2cd7a567f 100644 (file)
--- a/sound/soc/pxa/magician.c
+++ b/sound/soc/pxa/magician.c
@@ -308,17 +308,17 @@ static int magician_set_spk(struct snd_kcontrol *kcontrol,
  static int magician_get_input(struct snd_kcontrol *kcontrol,
                               struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = magician_in_sel;
+       ucontrol->value.enumerated.item[0] = magician_in_sel;
         return 0;
  }
  
  static int magician_set_input(struct snd_kcontrol *kcontrol,
                               struct snd_ctl_elem_value *ucontrol)
  {
-       if (magician_in_sel == ucontrol->value.integer.value[0])
+       if (magician_in_sel == ucontrol->value.enumerated.item[0])
                 return 0;
  
-       magician_in_sel = ucontrol->value.integer.value[0];
+       magician_in_sel = ucontrol->value.enumerated.item[0];
  
         switch (magician_in_sel) {
         case MAGICIAN_MIC:
diff --git a/sound/soc/pxa/poodle.c b/sound/soc/pxa/poodle.c

index 84d0e2e508088a260b54911bca03236a50046203..4b3b714f5ee7a67e5538b191a9da99bac86e56f8 100644 (file)
--- a/sound/soc/pxa/poodle.c
+++ b/sound/soc/pxa/poodle.c
@@ -138,7 +138,7 @@ static struct snd_soc_ops poodle_ops = {
  static int poodle_get_jack(struct snd_kcontrol *kcontrol,
         struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = poodle_jack_func;
+       ucontrol->value.enumerated.item[0] = poodle_jack_func;
         return 0;
  }
  
@@ -147,10 +147,10 @@ static int poodle_set_jack(struct snd_kcontrol *kcontrol,
  {
         struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
  
-       if (poodle_jack_func == ucontrol->value.integer.value[0])
+       if (poodle_jack_func == ucontrol->value.enumerated.item[0])
                 return 0;
  
-       poodle_jack_func = ucontrol->value.integer.value[0];
+       poodle_jack_func = ucontrol->value.enumerated.item[0];
         poodle_ext_control(&card->dapm);
         return 1;
  }
@@ -158,7 +158,7 @@ static int poodle_set_jack(struct snd_kcontrol *kcontrol,
  static int poodle_get_spk(struct snd_kcontrol *kcontrol,
         struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = poodle_spk_func;
+       ucontrol->value.enumerated.item[0] = poodle_spk_func;
         return 0;
  }
  
@@ -167,10 +167,10 @@ static int poodle_set_spk(struct snd_kcontrol *kcontrol,
  {
         struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
  
-       if (poodle_spk_func == ucontrol->value.integer.value[0])
+       if (poodle_spk_func == ucontrol->value.enumerated.item[0])
                 return 0;
  
-       poodle_spk_func = ucontrol->value.integer.value[0];
+       poodle_spk_func = ucontrol->value.enumerated.item[0];
         poodle_ext_control(&card->dapm);
         return 1;
  }
diff --git a/sound/soc/pxa/spitz.c b/sound/soc/pxa/spitz.c

index b00222620fd01be5cf00d4118d44f18832ad3bdc..0e02634c8b7f6f41764b75344ae850ff7ce586d7 100644 (file)
--- a/sound/soc/pxa/spitz.c
+++ b/sound/soc/pxa/spitz.c
@@ -164,7 +164,7 @@ static struct snd_soc_ops spitz_ops = {
  static int spitz_get_jack(struct snd_kcontrol *kcontrol,
         struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = spitz_jack_func;
+       ucontrol->value.enumerated.item[0] = spitz_jack_func;
         return 0;
  }
  
@@ -173,10 +173,10 @@ static int spitz_set_jack(struct snd_kcontrol *kcontrol,
  {
         struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
  
-       if (spitz_jack_func == ucontrol->value.integer.value[0])
+       if (spitz_jack_func == ucontrol->value.enumerated.item[0])
                 return 0;
  
-       spitz_jack_func = ucontrol->value.integer.value[0];
+       spitz_jack_func = ucontrol->value.enumerated.item[0];
         spitz_ext_control(&card->dapm);
         return 1;
  }
@@ -184,7 +184,7 @@ static int spitz_set_jack(struct snd_kcontrol *kcontrol,
  static int spitz_get_spk(struct snd_kcontrol *kcontrol,
         struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = spitz_spk_func;
+       ucontrol->value.enumerated.item[0] = spitz_spk_func;
         return 0;
  }
  
@@ -193,10 +193,10 @@ static int spitz_set_spk(struct snd_kcontrol *kcontrol,
  {
         struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
  
-       if (spitz_spk_func == ucontrol->value.integer.value[0])
+       if (spitz_spk_func == ucontrol->value.enumerated.item[0])
                 return 0;
  
-       spitz_spk_func = ucontrol->value.integer.value[0];
+       spitz_spk_func = ucontrol->value.enumerated.item[0];
         spitz_ext_control(&card->dapm);
         return 1;
  }
diff --git a/sound/soc/pxa/tosa.c b/sound/soc/pxa/tosa.c

index 49518dd642aa18ffd3f0004fe2f425210e820b54..c508f024ecfbc206887aeb9f91943311a279aed7 100644 (file)
--- a/sound/soc/pxa/tosa.c
+++ b/sound/soc/pxa/tosa.c
@@ -95,7 +95,7 @@ static struct snd_soc_ops tosa_ops = {
  static int tosa_get_jack(struct snd_kcontrol *kcontrol,
         struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = tosa_jack_func;
+       ucontrol->value.enumerated.item[0] = tosa_jack_func;
         return 0;
  }
  
@@ -104,10 +104,10 @@ static int tosa_set_jack(struct snd_kcontrol *kcontrol,
  {
         struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
  
-       if (tosa_jack_func == ucontrol->value.integer.value[0])
+       if (tosa_jack_func == ucontrol->value.enumerated.item[0])
                 return 0;
  
-       tosa_jack_func = ucontrol->value.integer.value[0];
+       tosa_jack_func = ucontrol->value.enumerated.item[0];
         tosa_ext_control(&card->dapm);
         return 1;
  }
@@ -115,7 +115,7 @@ static int tosa_set_jack(struct snd_kcontrol *kcontrol,
  static int tosa_get_spk(struct snd_kcontrol *kcontrol,
         struct snd_ctl_elem_value *ucontrol)
  {
-       ucontrol->value.integer.value[0] = tosa_spk_func;
+       ucontrol->value.enumerated.item[0] = tosa_spk_func;
         return 0;
  }
  
@@ -124,10 +124,10 @@ static int tosa_set_spk(struct snd_kcontrol *kcontrol,
  {
         struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
  
-       if (tosa_spk_func == ucontrol->value.integer.value[0])
+       if (tosa_spk_func == ucontrol->value.enumerated.item[0])
                 return 0;
  
-       tosa_spk_func = ucontrol->value.integer.value[0];
+       tosa_spk_func = ucontrol->value.enumerated.item[0];
         tosa_ext_control(&card->dapm);
         return 1;
  }
diff --git a/sound/soc/qcom/lpass-cpu.c b/sound/soc/qcom/lpass-cpu.c

index 00b6c9d039cfada651dad6e917be9318dc4ce762..e5101e0d2d372262f8ecf7d0498240cba971b86f 100644 (file)
--- a/sound/soc/qcom/lpass-cpu.c
+++ b/sound/soc/qcom/lpass-cpu.c
@@ -355,7 +355,6 @@ static struct regmap_config lpass_cpu_regmap_config = {
         .readable_reg = lpass_cpu_regmap_readable,
         .volatile_reg = lpass_cpu_regmap_volatile,
         .cache_type = REGCACHE_FLAT,
-       .val_format_endian = REGMAP_ENDIAN_LITTLE,
  };
  
  int asoc_qcom_lpass_cpu_platform_probe(struct platform_device *pdev)
diff --git a/sound/soc/samsung/i2s.c b/sound/soc/samsung/i2s.c

index 84d9e77c0fbe1e8d5e737d26beb492c3ce85883b..70a2559b63f9050bcbb86049717518d022463e2f 100644 (file)
--- a/sound/soc/samsung/i2s.c
+++ b/sound/soc/samsung/i2s.c
@@ -481,10 +481,11 @@ static int i2s_set_sysclk(struct snd_soc_dai *dai,
         unsigned int cdcon_mask = 1 << i2s_regs->cdclkcon_off;
         unsigned int rsrc_mask = 1 << i2s_regs->rclksrc_off;
         u32 mod, mask, val = 0;
+       unsigned long flags;
  
-       spin_lock(i2s->lock);
+       spin_lock_irqsave(i2s->lock, flags);
         mod = readl(i2s->addr + I2SMOD);
-       spin_unlock(i2s->lock);
+       spin_unlock_irqrestore(i2s->lock, flags);
  
         switch (clk_id) {
         case SAMSUNG_I2S_OPCLK:
@@ -575,11 +576,11 @@ static int i2s_set_sysclk(struct snd_soc_dai *dai,
                 return -EINVAL;
         }
  
-       spin_lock(i2s->lock);
+       spin_lock_irqsave(i2s->lock, flags);
         mod = readl(i2s->addr + I2SMOD);
         mod = (mod & ~mask) | val;
         writel(mod, i2s->addr + I2SMOD);
-       spin_unlock(i2s->lock);
+       spin_unlock_irqrestore(i2s->lock, flags);
  
         return 0;
  }
@@ -590,6 +591,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai,
         struct i2s_dai *i2s = to_info(dai);
         int lrp_shift, sdf_shift, sdf_mask, lrp_rlow, mod_slave;
         u32 mod, tmp = 0;
+       unsigned long flags;
  
         lrp_shift = i2s->variant_regs->lrp_off;
         sdf_shift = i2s->variant_regs->sdf_off;
@@ -649,7 +651,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai,
                 return -EINVAL;
         }
  
-       spin_lock(i2s->lock);
+       spin_lock_irqsave(i2s->lock, flags);
         mod = readl(i2s->addr + I2SMOD);
         /*
          * Don't change the I2S mode if any controller is active on this
@@ -657,7 +659,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai,
          */
         if (any_active(i2s) &&
                 ((mod & (sdf_mask | lrp_rlow | mod_slave)) != tmp)) {
-               spin_unlock(i2s->lock);
+               spin_unlock_irqrestore(i2s->lock, flags);
                 dev_err(&i2s->pdev->dev,
                                 "%s:%d Other DAI busy\n", __func__, __LINE__);
                 return -EAGAIN;
@@ -666,7 +668,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai,
         mod &= ~(sdf_mask | lrp_rlow | mod_slave);
         mod |= tmp;
         writel(mod, i2s->addr + I2SMOD);
-       spin_unlock(i2s->lock);
+       spin_unlock_irqrestore(i2s->lock, flags);
  
         return 0;
  }
@@ -676,6 +678,7 @@ static int i2s_hw_params(struct snd_pcm_substream *substream,
  {
         struct i2s_dai *i2s = to_info(dai);
         u32 mod, mask = 0, val = 0;
+       unsigned long flags;
  
         if (!is_secondary(i2s))
                 mask |= (MOD_DC2_EN | MOD_DC1_EN);
@@ -744,11 +747,11 @@ static int i2s_hw_params(struct snd_pcm_substream *substream,
                 return -EINVAL;
         }
  
-       spin_lock(i2s->lock);
+       spin_lock_irqsave(i2s->lock, flags);
         mod = readl(i2s->addr + I2SMOD);
         mod = (mod & ~mask) | val;
         writel(mod, i2s->addr + I2SMOD);
-       spin_unlock(i2s->lock);
+       spin_unlock_irqrestore(i2s->lock, flags);
  
         samsung_asoc_init_dma_data(dai, &i2s->dma_playback, &i2s->dma_capture);
  
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c

index 0d37079879002d472c538b69ccc3321c0d7772ad..581175a51ecf730bb8c6a2271d137f049c2a445d 100644 (file)
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -3573,7 +3573,7 @@ static int snd_soc_dapm_dai_link_get(struct snd_kcontrol *kcontrol,
  {
         struct snd_soc_dapm_widget *w = snd_kcontrol_chip(kcontrol);
  
-       ucontrol->value.integer.value[0] = w->params_select;
+       ucontrol->value.enumerated.item[0] = w->params_select;
  
         return 0;
  }
@@ -3587,13 +3587,13 @@ static int snd_soc_dapm_dai_link_put(struct snd_kcontrol *kcontrol,
         if (w->power)
                 return -EBUSY;
  
-       if (ucontrol->value.integer.value[0] == w->params_select)
+       if (ucontrol->value.enumerated.item[0] == w->params_select)
                 return 0;
  
-       if (ucontrol->value.integer.value[0] >= w->num_params)
+       if (ucontrol->value.enumerated.item[0] >= w->num_params)
                 return -EINVAL;
  
-       w->params_select = ucontrol->value.integer.value[0];
+       w->params_select = ucontrol->value.enumerated.item[0];
  
         return 0;
  }
diff --git a/tools/build/Makefile.build b/tools/build/Makefile.build

index 4a96473b180f938b038d5e698f3578876eb7c70c..ee566e8bd1cff56efde9200bf4a76caa840669f1 100644 (file)
--- a/tools/build/Makefile.build
+++ b/tools/build/Makefile.build
@@ -85,7 +85,7 @@ $(OUTPUT)%.i: %.c FORCE
         $(call rule_mkdir)
         $(call if_changed_dep,cc_i_c)
  
-$(OUTPUT)%.i: %.S FORCE
+$(OUTPUT)%.s: %.S FORCE
         $(call rule_mkdir)
         $(call if_changed_dep,cc_i_c)
  
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature

index 02db3cdff20ff7653c8ca0db21d28ef7b508a9eb..6b7707270aa3b19791c8b6248f90ecddeabc1fdd 100644 (file)
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -27,7 +27,7 @@ endef
  #   the rule that uses them - an example for that is the 'bionic'
  #   feature check. ]
  #
-FEATURE_TESTS ?=                       \
+FEATURE_TESTS_BASIC :=                 \
         backtrace                       \
         dwarf                           \
         fortify-source                  \
@@ -46,6 +46,7 @@ FEATURE_TESTS ?=                      \
         libpython                       \
         libpython-version               \
         libslang                        \
+       libcrypto                       \
         libunwind                       \
         pthread-attr-setaffinity-np     \
         stackprotector-all              \
@@ -56,6 +57,25 @@ FEATURE_TESTS ?=                     \
         get_cpuid                       \
         bpf
  
+# FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list
+# of all feature tests
+FEATURE_TESTS_EXTRA :=                 \
+       bionic                          \
+       compile-32                      \
+       compile-x32                     \
+       cplus-demangle                  \
+       hello                           \
+       libbabeltrace                   \
+       liberty                         \
+       liberty-z                       \
+       libunwind-debug-frame
+
+FEATURE_TESTS ?= $(FEATURE_TESTS_BASIC)
+
+ifeq ($(FEATURE_TESTS),all)
+  FEATURE_TESTS := $(FEATURE_TESTS_BASIC) $(FEATURE_TESTS_EXTRA)
+endif
+
  FEATURE_DISPLAY ?=                     \
         dwarf                           \
         glibc                           \
@@ -68,6 +88,7 @@ FEATURE_DISPLAY ?=                    \
         libperl                         \
         libpython                       \
         libslang                        \
+       libcrypto                       \
         libunwind                       \
         libdw-dwarf-unwind              \
         zlib                            \
@@ -100,6 +121,14 @@ ifeq ($(feature-all), 1)
    # test-all.c passed - just set all the core feature flags to 1:
    #
    $(foreach feat,$(FEATURE_TESTS),$(call feature_set,$(feat)))
+  #
+  # test-all.c does not comprise these tests, so we need to
+  # for this case to get features proper values
+  #
+  $(call feature_check,compile-32)
+  $(call feature_check,compile-x32)
+  $(call feature_check,bionic)
+  $(call feature_check,libbabeltrace)
  else
    $(foreach feat,$(FEATURE_TESTS),$(call feature_check,$(feat)))
  endif
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile

index bf8f0352264dcc8a5a1286bc4add2ca9de481357..c5f4c417428d7099fbe4f487a179b0663f478611 100644 (file)
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -23,6 +23,7 @@ FILES=                                        \
         test-libpython.bin              \
         test-libpython-version.bin      \
         test-libslang.bin               \
+       test-libcrypto.bin              \
         test-libunwind.bin              \
         test-libunwind-debug-frame.bin  \
         test-pthread-attr-setaffinity-np.bin    \
@@ -105,6 +106,9 @@ $(OUTPUT)test-libaudit.bin:
  $(OUTPUT)test-libslang.bin:
         $(BUILD) -I/usr/include/slang -lslang
  
+$(OUTPUT)test-libcrypto.bin:
+       $(BUILD) -lcrypto
+
  $(OUTPUT)test-gtk2.bin:
         $(BUILD) $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null)
  
diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c

index 81025cade45fa9a3fcc6f8f0aacc8f403e6d9211..e499a36c1e4a9e21e9c355309b53a7dc5901664a 100644 (file)
--- a/tools/build/feature/test-all.c
+++ b/tools/build/feature/test-all.c
@@ -129,6 +129,10 @@
  # include "test-bpf.c"
  #undef main
  
+#define main main_test_libcrypto
+# include "test-libcrypto.c"
+#undef main
+
  int main(int argc, char *argv[])
  {
         main_test_libpython();
@@ -158,6 +162,7 @@ int main(int argc, char *argv[])
         main_test_lzma();
         main_test_get_cpuid();
         main_test_bpf();
+       main_test_libcrypto();
  
         return 0;
  }
diff --git a/tools/build/feature/test-compile.c b/tools/build/feature/test-compile.c

index 31dbf45bf99c5996ca0ee62cba186d912b08e666..c54e6551ae4c54cd6e9f418e6e8ff97280444efd 100644 (file)
--- a/tools/build/feature/test-compile.c
+++ b/tools/build/feature/test-compile.c
@@ -1,4 +1,6 @@
+#include <stdio.h>
  int main(void)
  {
+       printf("Hello World!\n");
         return 0;
  }
diff --git a/tools/build/feature/test-libcrypto.c b/tools/build/feature/test-libcrypto.c

new file mode 100644 (file)

index 0000000..bd79dc7
--- /dev/null
+++ b/tools/build/feature/test-libcrypto.c
@@ -0,0 +1,17 @@
+#include <openssl/sha.h>
+#include <openssl/md5.h>
+
+int main(void)
+{
+       MD5_CTX context;
+       unsigned char md[MD5_DIGEST_LENGTH + SHA_DIGEST_LENGTH];
+       unsigned char dat[] = "12345";
+
+       MD5_Init(&context);
+       MD5_Update(&context, &dat[0], sizeof(dat));
+       MD5_Final(&md[0], &context);
+
+       SHA1(&dat[0], sizeof(dat), &md[0]);
+
+       return 0;
+}
diff --git a/tools/lib/api/Build b/tools/lib/api/Build

index e8b8a23b9bf4cdac8ccfcf95690f8dfd93e1d6e6..954c644f7ad9e5ee0f54bdf16cf2b68766f1e2a5 100644 (file)
--- a/tools/lib/api/Build
+++ b/tools/lib/api/Build
@@ -1,3 +1,4 @@
  libapi-y += fd/
  libapi-y += fs/
  libapi-y += cpu.o
+libapi-y += debug.o
diff --git a/tools/lib/api/Makefile b/tools/lib/api/Makefile

index d85904dc9b38747dc0feda60f819cf93ceabbdcf..bbc82c614bee62eec8563c5b1dd9fd222eafa5de 100644 (file)
--- a/tools/lib/api/Makefile
+++ b/tools/lib/api/Makefile
@@ -18,6 +18,7 @@ LIBFILE = $(OUTPUT)libapi.a
  CFLAGS := $(EXTRA_WARNINGS) $(EXTRA_CFLAGS)
  CFLAGS += -ggdb3 -Wall -Wextra -std=gnu99 -Werror -O6 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -fPIC
  CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
+CFLAGS += -I$(srctree)/tools/lib/api
  
  RM = rm -f
  
diff --git a/tools/lib/api/debug-internal.h b/tools/lib/api/debug-internal.h

new file mode 100644 (file)

index 0000000..188f788
--- /dev/null
+++ b/tools/lib/api/debug-internal.h
@@ -0,0 +1,20 @@
+#ifndef __API_DEBUG_INTERNAL_H__
+#define __API_DEBUG_INTERNAL_H__
+
+#include "debug.h"
+
+#define __pr(func, fmt, ...)   \
+do {                           \
+       if ((func))             \
+               (func)("libapi: " fmt, ##__VA_ARGS__); \
+} while (0)
+
+extern libapi_print_fn_t __pr_warning;
+extern libapi_print_fn_t __pr_info;
+extern libapi_print_fn_t __pr_debug;
+
+#define pr_warning(fmt, ...)   __pr(__pr_warning, fmt, ##__VA_ARGS__)
+#define pr_info(fmt, ...)      __pr(__pr_info, fmt, ##__VA_ARGS__)
+#define pr_debug(fmt, ...)     __pr(__pr_debug, fmt, ##__VA_ARGS__)
+
+#endif /* __API_DEBUG_INTERNAL_H__ */
diff --git a/tools/lib/api/debug.c b/tools/lib/api/debug.c

new file mode 100644 (file)

index 0000000..5fa5cf5
--- /dev/null
+++ b/tools/lib/api/debug.c
@@ -0,0 +1,28 @@
+#include <stdio.h>
+#include <stdarg.h>
+#include "debug.h"
+#include "debug-internal.h"
+
+static int __base_pr(const char *format, ...)
+{
+       va_list args;
+       int err;
+
+       va_start(args, format);
+       err = vfprintf(stderr, format, args);
+       va_end(args);
+       return err;
+}
+
+libapi_print_fn_t __pr_warning = __base_pr;
+libapi_print_fn_t __pr_info    = __base_pr;
+libapi_print_fn_t __pr_debug;
+
+void libapi_set_print(libapi_print_fn_t warn,
+                     libapi_print_fn_t info,
+                     libapi_print_fn_t debug)
+{
+       __pr_warning = warn;
+       __pr_info    = info;
+       __pr_debug   = debug;
+}
diff --git a/tools/lib/api/debug.h b/tools/lib/api/debug.h

new file mode 100644 (file)

index 0000000..a0872f6
--- /dev/null
+++ b/tools/lib/api/debug.h
@@ -0,0 +1,10 @@
+#ifndef __API_DEBUG_H__
+#define __API_DEBUG_H__
+
+typedef int (*libapi_print_fn_t)(const char *, ...);
+
+void libapi_set_print(libapi_print_fn_t warn,
+                     libapi_print_fn_t info,
+                     libapi_print_fn_t debug);
+
+#endif /* __API_DEBUG_H__ */
diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c

index 459599d1b6c410b7b41333c13f248ee685c2dd4a..ef78c22ff44d4142f01a1f5aea3bbe4e15fd0101 100644 (file)
--- a/tools/lib/api/fs/fs.c
+++ b/tools/lib/api/fs/fs.c
@@ -13,6 +13,7 @@
  #include <sys/mount.h>
  
  #include "fs.h"
+#include "debug-internal.h"
  
  #define _STR(x) #x
  #define STR(x) _STR(x)
@@ -300,6 +301,56 @@ int filename__read_ull(const char *filename, unsigned long long *value)
         return err;
  }
  
+#define STRERR_BUFSIZE  128     /* For the buffer size of strerror_r */
+
+int filename__read_str(const char *filename, char **buf, size_t *sizep)
+{
+       size_t size = 0, alloc_size = 0;
+       void *bf = NULL, *nbf;
+       int fd, n, err = 0;
+       char sbuf[STRERR_BUFSIZE];
+
+       fd = open(filename, O_RDONLY);
+       if (fd < 0)
+               return -errno;
+
+       do {
+               if (size == alloc_size) {
+                       alloc_size += BUFSIZ;
+                       nbf = realloc(bf, alloc_size);
+                       if (!nbf) {
+                               err = -ENOMEM;
+                               break;
+                       }
+
+                       bf = nbf;
+               }
+
+               n = read(fd, bf + size, alloc_size - size);
+               if (n < 0) {
+                       if (size) {
+                               pr_warning("read failed %d: %s\n", errno,
+                                        strerror_r(errno, sbuf, sizeof(sbuf)));
+                               err = 0;
+                       } else
+                               err = -errno;
+
+                       break;
+               }
+
+               size += n;
+       } while (n > 0);
+
+       if (!err) {
+               *sizep = size;
+               *buf   = bf;
+       } else
+               free(bf);
+
+       close(fd);
+       return err;
+}
+
  int sysfs__read_ull(const char *entry, unsigned long long *value)
  {
         char path[PATH_MAX];
@@ -326,6 +377,19 @@ int sysfs__read_int(const char *entry, int *value)
         return filename__read_int(path, value);
  }
  
+int sysfs__read_str(const char *entry, char **buf, size_t *sizep)
+{
+       char path[PATH_MAX];
+       const char *sysfs = sysfs__mountpoint();
+
+       if (!sysfs)
+               return -1;
+
+       snprintf(path, sizeof(path), "%s/%s", sysfs, entry);
+
+       return filename__read_str(path, buf, sizep);
+}
+
  int sysctl__read_int(const char *sysctl, int *value)
  {
         char path[PATH_MAX];
diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h

index d024a7f682f69b27a4bba1aed7b8f1894d07b50f..9f6598098dc5804a5bf660013ad5b07c499b6169 100644 (file)
--- a/tools/lib/api/fs/fs.h
+++ b/tools/lib/api/fs/fs.h
@@ -2,6 +2,7 @@
  #define __API_FS__
  
  #include <stdbool.h>
+#include <unistd.h>
  
  /*
   * On most systems <limits.h> would have given us this, but  not on some systems
@@ -26,8 +27,10 @@ FS(tracefs)
  
  int filename__read_int(const char *filename, int *value);
  int filename__read_ull(const char *filename, unsigned long long *value);
+int filename__read_str(const char *filename, char **buf, size_t *sizep);
  
  int sysctl__read_int(const char *sysctl, int *value);
  int sysfs__read_int(const char *entry, int *value);
  int sysfs__read_ull(const char *entry, unsigned long long *value);
+int sysfs__read_str(const char *entry, char **buf, size_t *sizep);
  #endif /* __API_FS__ */
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c

index 8334a5a9d5d7f55e97144e1d7462c52deec02e02..7e543c3102d4118a09717bb6fb4c0f779532ae91 100644 (file)
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -201,6 +201,7 @@ struct bpf_object {
                         Elf_Data *data;
                 } *reloc;
                 int nr_reloc;
+               int maps_shndx;
         } efile;
         /*
          * All loaded bpf_object is linked in a list, which is
@@ -350,6 +351,7 @@ static struct bpf_object *bpf_object__new(const char *path,
          */
         obj->efile.obj_buf = obj_buf;
         obj->efile.obj_buf_sz = obj_buf_sz;
+       obj->efile.maps_shndx = -1;
  
         obj->loaded = false;
  
@@ -529,12 +531,12 @@ bpf_object__init_maps(struct bpf_object *obj, void *data,
  }
  
  static int
-bpf_object__init_maps_name(struct bpf_object *obj, int maps_shndx)
+bpf_object__init_maps_name(struct bpf_object *obj)
  {
         int i;
         Elf_Data *symbols = obj->efile.symbols;
  
-       if (!symbols || maps_shndx < 0)
+       if (!symbols || obj->efile.maps_shndx < 0)
                 return -EINVAL;
  
         for (i = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
@@ -544,7 +546,7 @@ bpf_object__init_maps_name(struct bpf_object *obj, int maps_shndx)
  
                 if (!gelf_getsym(symbols, i, &sym))
                         continue;
-               if (sym.st_shndx != maps_shndx)
+               if (sym.st_shndx != obj->efile.maps_shndx)
                         continue;
  
                 map_name = elf_strptr(obj->efile.elf,
@@ -572,7 +574,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
         Elf *elf = obj->efile.elf;
         GElf_Ehdr *ep = &obj->efile.ehdr;
         Elf_Scn *scn = NULL;
-       int idx = 0, err = 0, maps_shndx = -1;
+       int idx = 0, err = 0;
  
         /* Elf is corrupted/truncated, avoid calling elf_strptr. */
         if (!elf_rawdata(elf_getscn(elf, ep->e_shstrndx), NULL)) {
@@ -625,7 +627,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
                 else if (strcmp(name, "maps") == 0) {
                         err = bpf_object__init_maps(obj, data->d_buf,
                                                     data->d_size);
-                       maps_shndx = idx;
+                       obj->efile.maps_shndx = idx;
                 } else if (sh.sh_type == SHT_SYMTAB) {
                         if (obj->efile.symbols) {
                                 pr_warning("bpf: multiple SYMTAB in %s\n",
@@ -674,8 +676,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
                 pr_warning("Corrupted ELF file: index of strtab invalid\n");
                 return LIBBPF_ERRNO__FORMAT;
         }
-       if (maps_shndx >= 0)
-               err = bpf_object__init_maps_name(obj, maps_shndx);
+       if (obj->efile.maps_shndx >= 0)
+               err = bpf_object__init_maps_name(obj);
  out:
         return err;
  }
@@ -697,7 +699,8 @@ bpf_object__find_prog_by_idx(struct bpf_object *obj, int idx)
  static int
  bpf_program__collect_reloc(struct bpf_program *prog,
                            size_t nr_maps, GElf_Shdr *shdr,
-                          Elf_Data *data, Elf_Data *symbols)
+                          Elf_Data *data, Elf_Data *symbols,
+                          int maps_shndx)
  {
         int i, nrels;
  
@@ -724,9 +727,6 @@ bpf_program__collect_reloc(struct bpf_program *prog,
                         return -LIBBPF_ERRNO__FORMAT;
                 }
  
-               insn_idx = rel.r_offset / sizeof(struct bpf_insn);
-               pr_debug("relocation: insn_idx=%u\n", insn_idx);
-
                 if (!gelf_getsym(symbols,
                                  GELF_R_SYM(rel.r_info),
                                  &sym)) {
@@ -735,6 +735,15 @@ bpf_program__collect_reloc(struct bpf_program *prog,
                         return -LIBBPF_ERRNO__FORMAT;
                 }
  
+               if (sym.st_shndx != maps_shndx) {
+                       pr_warning("Program '%s' contains non-map related relo data pointing to section %u\n",
+                                  prog->section_name, sym.st_shndx);
+                       return -LIBBPF_ERRNO__RELOC;
+               }
+
+               insn_idx = rel.r_offset / sizeof(struct bpf_insn);
+               pr_debug("relocation: insn_idx=%u\n", insn_idx);
+
                 if (insns[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
                         pr_warning("bpf: relocation: invalid relo for insns[%d].code 0x%x\n",
                                    insn_idx, insns[insn_idx].code);
@@ -863,7 +872,8 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
  
                 err = bpf_program__collect_reloc(prog, nr_maps,
                                                  shdr, data,
-                                                obj->efile.symbols);
+                                                obj->efile.symbols,
+                                                obj->efile.maps_shndx);
                 if (err)
                         return err;
         }
diff --git a/tools/lib/lockdep/Makefile b/tools/lib/lockdep/Makefile

index 90d2baeb621a5265f703ecc0e3bc0cbde60ac1f4..1d57af56814b8671c10d9619202f042ad5c031d4 100644 (file)
--- a/tools/lib/lockdep/Makefile
+++ b/tools/lib/lockdep/Makefile
@@ -100,7 +100,7 @@ include $(srctree)/tools/build/Makefile.include
  
  do_compile_shared_library =                    \
         ($(print_shared_lib_compile)            \
-       $(CC) --shared $^ -o $@ -lpthread -ldl -Wl,-soname='"$@"';$(shell ln -s $@ liblockdep.so))
+       $(CC) --shared $^ -o $@ -lpthread -ldl -Wl,-soname='"$@"';$(shell ln -sf $@ liblockdep.so))
  
  do_build_static_lib =                          \
         ($(print_static_lib_build)              \
diff --git a/tools/lib/lockdep/common.c b/tools/lib/lockdep/common.c

index 9be663340f0a4c48e9ec51c996b13c3a8b596fdc..d1c89cc06f5f6710471c42dc5b2a475a54ee7658 100644 (file)
--- a/tools/lib/lockdep/common.c
+++ b/tools/lib/lockdep/common.c
@@ -11,11 +11,6 @@ static __thread struct task_struct current_obj;
  bool debug_locks = true;
  bool debug_locks_silent;
  
-__attribute__((constructor)) static void liblockdep_init(void)
-{
-       lockdep_init();
-}
-
  __attribute__((destructor)) static void liblockdep_exit(void)
  {
         debug_check_no_locks_held();
diff --git a/tools/lib/lockdep/include/liblockdep/common.h b/tools/lib/lockdep/include/liblockdep/common.h

index a60c14b9662aefd6f17b47dec534bce2f53d9f03..6e66277ec4375c84693b25756fdf8336faf79fcf 100644 (file)
--- a/tools/lib/lockdep/include/liblockdep/common.h
+++ b/tools/lib/lockdep/include/liblockdep/common.h
@@ -44,7 +44,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
  void lock_release(struct lockdep_map *lock, int nested,
                         unsigned long ip);
  extern void debug_check_no_locks_freed(const void *from, unsigned long len);
-extern void lockdep_init(void);
  
  #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
         { .name = (_name), .key = (void *)(_key), }
diff --git a/tools/lib/lockdep/lockdep.c b/tools/lib/lockdep/lockdep.c

index f42b7e9aa48f2661fb94fced614db8f443eca9c4..a0a2e3a266af8122fed3d66b3f5a537ab3c1c773 100644 (file)
--- a/tools/lib/lockdep/lockdep.c
+++ b/tools/lib/lockdep/lockdep.c
@@ -1,2 +1,8 @@
  #include <linux/lockdep.h>
+
+/* Trivial API wrappers, we don't (yet) have RCU in user-space: */
+#define hlist_for_each_entry_rcu       hlist_for_each_entry
+#define hlist_add_head_rcu             hlist_add_head
+#define hlist_del_rcu                  hlist_del
+
  #include "../../../kernel/locking/lockdep.c"
diff --git a/tools/lib/lockdep/preload.c b/tools/lib/lockdep/preload.c

index 21cdf869a01b8b7facc084c4dc66cf73afa163bb..52844847569c99b878c031da8f25a13d1c0b5838 100644 (file)
--- a/tools/lib/lockdep/preload.c
+++ b/tools/lib/lockdep/preload.c
@@ -439,7 +439,5 @@ __attribute__((constructor)) static void init_preload(void)
         ll_pthread_rwlock_unlock = dlsym(RTLD_NEXT, "pthread_rwlock_unlock");
  #endif
  
-       lockdep_init();
-
         __init_state = done;
  }
diff --git a/tools/lib/lockdep/tests/AA.c b/tools/lib/lockdep/tests/AA.c

index 0f782ff404ac65999c8689555967eabb9c6f0e52..18211a5f354fe53e2a0650fe1ca9150956fd5b7a 100644 (file)
--- a/tools/lib/lockdep/tests/AA.c
+++ b/tools/lib/lockdep/tests/AA.c
@@ -1,13 +1,13 @@
  #include <liblockdep/mutex.h>
  
-void main(void)
+int main(void)
  {
-       pthread_mutex_t a, b;
+       pthread_mutex_t a;
  
         pthread_mutex_init(&a, NULL);
-       pthread_mutex_init(&b, NULL);
  
         pthread_mutex_lock(&a);
-       pthread_mutex_lock(&b);
         pthread_mutex_lock(&a);
+
+       return 0;
  }
diff --git a/tools/lib/lockdep/tests/ABA.c b/tools/lib/lockdep/tests/ABA.c

new file mode 100644 (file)

index 0000000..0f782ff
--- /dev/null
+++ b/tools/lib/lockdep/tests/ABA.c
@@ -0,0 +1,13 @@
+#include <liblockdep/mutex.h>
+
+void main(void)
+{
+       pthread_mutex_t a, b;
+
+       pthread_mutex_init(&a, NULL);
+       pthread_mutex_init(&b, NULL);
+
+       pthread_mutex_lock(&a);
+       pthread_mutex_lock(&b);
+       pthread_mutex_lock(&a);
+}
diff --git a/tools/lib/lockdep/tests/ABBA_2threads.c b/tools/lib/lockdep/tests/ABBA_2threads.c

new file mode 100644 (file)

index 0000000..cd807d7
--- /dev/null
+++ b/tools/lib/lockdep/tests/ABBA_2threads.c
@@ -0,0 +1,46 @@
+#include <stdio.h>
+#include <pthread.h>
+
+pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;
+pthread_barrier_t bar;
+
+void *ba_lock(void *arg)
+{
+       int ret, i;
+
+       pthread_mutex_lock(&b);
+
+       if (pthread_barrier_wait(&bar) == PTHREAD_BARRIER_SERIAL_THREAD)
+               pthread_barrier_destroy(&bar);
+
+       pthread_mutex_lock(&a);
+
+       pthread_mutex_unlock(&a);
+       pthread_mutex_unlock(&b);
+}
+
+int main(void)
+{
+       pthread_t t;
+
+       pthread_barrier_init(&bar, NULL, 2);
+
+       if (pthread_create(&t, NULL, ba_lock, NULL)) {
+               fprintf(stderr, "pthread_create() failed\n");
+               return 1;
+       }
+       pthread_mutex_lock(&a);
+
+       if (pthread_barrier_wait(&bar) == PTHREAD_BARRIER_SERIAL_THREAD)
+               pthread_barrier_destroy(&bar);
+
+       pthread_mutex_lock(&b);
+
+       pthread_mutex_unlock(&b);
+       pthread_mutex_unlock(&a);
+
+       pthread_join(t, NULL);
+
+       return 0;
+}
diff --git a/tools/lib/lockdep/uinclude/linux/compiler.h b/tools/lib/lockdep/uinclude/linux/compiler.h

index 6386dc3182a0985a40551d0af980e53102e04c2a..fd3e56a83fc27025a9016cae2eaf1882c751cc01 100644 (file)
--- a/tools/lib/lockdep/uinclude/linux/compiler.h
+++ b/tools/lib/lockdep/uinclude/linux/compiler.h
@@ -3,6 +3,7 @@
  
  #define __used         __attribute__((__unused__))
  #define unlikely
+#define READ_ONCE(x) (x)
  #define WRITE_ONCE(x, val) x=(val)
  #define RCU_INIT_POINTER(p, v) p=(v)
  
diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c

index c3bd294a63d1f7e4a75b2a950bbe398f33703cd6..190cc886ab9105ea6be58ad58caf8e940274c4a9 100644 (file)
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -1951,6 +1951,7 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok)
                    strcmp(token, "*") == 0 ||
                    strcmp(token, "^") == 0 ||
                    strcmp(token, "/") == 0 ||
+                  strcmp(token, "%") == 0 ||
                    strcmp(token, "<") == 0 ||
                    strcmp(token, ">") == 0 ||
                    strcmp(token, "<=") == 0 ||
@@ -2397,6 +2398,12 @@ static int arg_num_eval(struct print_arg *arg, long long *val)
                                 break;
                         *val = left + right;
                         break;
+               case '~':
+                       ret = arg_num_eval(arg->op.right, &right);
+                       if (!ret)
+                               break;
+                       *val = ~right;
+                       break;
                 default:
                         do_warning("unknown op '%s'", arg->op.op);
                         ret = 0;
@@ -2634,6 +2641,7 @@ process_hex(struct event_format *event, struct print_arg *arg, char **tok)
  
  free_field:
         free_arg(arg->hex.field);
+       arg->hex.field = NULL;
  out:
         *tok = NULL;
         return EVENT_ERROR;
@@ -2658,8 +2666,10 @@ process_int_array(struct event_format *event, struct print_arg *arg, char **tok)
  
  free_size:
         free_arg(arg->int_array.count);
+       arg->int_array.count = NULL;
  free_field:
         free_arg(arg->int_array.field);
+       arg->int_array.field = NULL;
  out:
         *tok = NULL;
         return EVENT_ERROR;
@@ -3689,6 +3699,9 @@ eval_num_arg(void *data, int size, struct event_format *event, struct print_arg
                 case '/':
                         val = left / right;
                         break;
+               case '%':
+                       val = left % right;
+                       break;
                 case '*':
                         val = left * right;
                         break;
@@ -4971,7 +4984,7 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event
                                                 break;
                                         }
                                 }
-                               if (pevent->long_size == 8 && ls &&
+                               if (pevent->long_size == 8 && ls == 1 &&
                                     sizeof(long) != 8) {
                                         char *p;
  
@@ -5335,41 +5348,45 @@ static bool is_timestamp_in_us(char *trace_clock, bool use_trace_clock)
         return false;
  }
  
-void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
-                       struct pevent_record *record, bool use_trace_clock)
+/**
+ * pevent_find_event_by_record - return the event from a given record
+ * @pevent: a handle to the pevent
+ * @record: The record to get the event from
+ *
+ * Returns the associated event for a given record, or NULL if non is
+ * is found.
+ */
+struct event_format *
+pevent_find_event_by_record(struct pevent *pevent, struct pevent_record *record)
  {
-       static const char *spaces = "                    "; /* 20 spaces */
-       struct event_format *event;
-       unsigned long secs;
-       unsigned long usecs;
-       unsigned long nsecs;
-       const char *comm;
-       void *data = record->data;
         int type;
-       int pid;
-       int len;
-       int p;
-       bool use_usec_format;
-
-       use_usec_format = is_timestamp_in_us(pevent->trace_clock,
-                                                       use_trace_clock);
-       if (use_usec_format) {
-               secs = record->ts / NSECS_PER_SEC;
-               nsecs = record->ts - secs * NSECS_PER_SEC;
-       }
  
         if (record->size < 0) {
                 do_warning("ug! negative record size %d", record->size);
-               return;
+               return NULL;
         }
  
-       type = trace_parse_common_type(pevent, data);
+       type = trace_parse_common_type(pevent, record->data);
  
-       event = pevent_find_event(pevent, type);
-       if (!event) {
-               do_warning("ug! no event found for type %d", type);
-               return;
-       }
+       return pevent_find_event(pevent, type);
+}
+
+/**
+ * pevent_print_event_task - Write the event task comm, pid and CPU
+ * @pevent: a handle to the pevent
+ * @s: the trace_seq to write to
+ * @event: the handle to the record's event
+ * @record: The record to get the event from
+ *
+ * Writes the tasks comm, pid and CPU to @s.
+ */
+void pevent_print_event_task(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record)
+{
+       void *data = record->data;
+       const char *comm;
+       int pid;
  
         pid = parse_common_pid(pevent, data);
         comm = find_cmdline(pevent, pid);
@@ -5377,9 +5394,43 @@ void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
         if (pevent->latency_format) {
                 trace_seq_printf(s, "%8.8s-%-5d %3d",
                        comm, pid, record->cpu);
-               pevent_data_lat_fmt(pevent, s, record);
         } else
                 trace_seq_printf(s, "%16s-%-5d [%03d]", comm, pid, record->cpu);
+}
+
+/**
+ * pevent_print_event_time - Write the event timestamp
+ * @pevent: a handle to the pevent
+ * @s: the trace_seq to write to
+ * @event: the handle to the record's event
+ * @record: The record to get the event from
+ * @use_trace_clock: Set to parse according to the @pevent->trace_clock
+ *
+ * Writes the timestamp of the record into @s.
+ */
+void pevent_print_event_time(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record,
+                            bool use_trace_clock)
+{
+       unsigned long secs;
+       unsigned long usecs;
+       unsigned long nsecs;
+       int p;
+       bool use_usec_format;
+
+       use_usec_format = is_timestamp_in_us(pevent->trace_clock,
+                                                       use_trace_clock);
+       if (use_usec_format) {
+               secs = record->ts / NSECS_PER_SEC;
+               nsecs = record->ts - secs * NSECS_PER_SEC;
+       }
+
+       if (pevent->latency_format) {
+               trace_seq_printf(s, " %3d", record->cpu);
+               pevent_data_lat_fmt(pevent, s, record);
+       } else
+               trace_seq_printf(s, " [%03d]", record->cpu);
  
         if (use_usec_format) {
                 if (pevent->flags & PEVENT_NSEC_OUTPUT) {
@@ -5387,14 +5438,36 @@ void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
                         p = 9;
                 } else {
                         usecs = (nsecs + 500) / NSECS_PER_USEC;
+                       /* To avoid usecs larger than 1 sec */
+                       if (usecs >= 1000000) {
+                               usecs -= 1000000;
+                               secs++;
+                       }
                         p = 6;
                 }
  
-               trace_seq_printf(s, " %5lu.%0*lu: %s: ",
-                                       secs, p, usecs, event->name);
+               trace_seq_printf(s, " %5lu.%0*lu:", secs, p, usecs);
         } else
-               trace_seq_printf(s, " %12llu: %s: ",
-                                       record->ts, event->name);
+               trace_seq_printf(s, " %12llu:", record->ts);
+}
+
+/**
+ * pevent_print_event_data - Write the event data section
+ * @pevent: a handle to the pevent
+ * @s: the trace_seq to write to
+ * @event: the handle to the record's event
+ * @record: The record to get the event from
+ *
+ * Writes the parsing of the record's data to @s.
+ */
+void pevent_print_event_data(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record)
+{
+       static const char *spaces = "                    "; /* 20 spaces */
+       int len;
+
+       trace_seq_printf(s, " %s: ", event->name);
  
         /* Space out the event names evenly. */
         len = strlen(event->name);
@@ -5404,6 +5477,23 @@ void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
         pevent_event_info(s, event, record);
  }
  
+void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
+                       struct pevent_record *record, bool use_trace_clock)
+{
+       struct event_format *event;
+
+       event = pevent_find_event_by_record(pevent, record);
+       if (!event) {
+               do_warning("ug! no event found for type %d",
+                          trace_parse_common_type(pevent, record->data));
+               return;
+       }
+
+       pevent_print_event_task(pevent, s, event, record);
+       pevent_print_event_time(pevent, s, event, record, use_trace_clock);
+       pevent_print_event_data(pevent, s, event, record);
+}
+
  static int events_id_cmp(const void *a, const void *b)
  {
         struct event_format * const * ea = a;
diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h

index 706d9bc24066cf308ba17d718eafd95c4ff5550c..9ffde377e89d912ffb2127fe454872346759738b 100644 (file)
--- a/tools/lib/traceevent/event-parse.h
+++ b/tools/lib/traceevent/event-parse.h
@@ -628,6 +628,16 @@ int pevent_register_print_string(struct pevent *pevent, const char *fmt,
                                  unsigned long long addr);
  int pevent_pid_is_registered(struct pevent *pevent, int pid);
  
+void pevent_print_event_task(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record);
+void pevent_print_event_time(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record,
+                            bool use_trace_clock);
+void pevent_print_event_data(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record);
  void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
                         struct pevent_record *record, bool use_trace_clock);
  
@@ -694,6 +704,9 @@ struct event_format *pevent_find_event(struct pevent *pevent, int id);
  struct event_format *
  pevent_find_event_by_name(struct pevent *pevent, const char *sys, const char *name);
  
+struct event_format *
+pevent_find_event_by_record(struct pevent *pevent, struct pevent_record *record);
+
  void pevent_data_lat_fmt(struct pevent *pevent,
                          struct trace_seq *s, struct pevent_record *record);
  int pevent_data_type(struct pevent *pevent, struct pevent_record *rec);
diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt

index b9ca1e304158da86f36dafad33e7de0a52e01567..15949e2a7805cc6e7d848ddbf2f99b249e332f3b 100644 (file)
--- a/tools/perf/Documentation/perf-config.txt
+++ b/tools/perf/Documentation/perf-config.txt
@@ -8,7 +8,7 @@ perf-config - Get and set variables in a configuration file.
  SYNOPSIS
  --------
  [verse]
-'perf config' -l | --list
+'perf config' [<file-option>] -l | --list
  
  DESCRIPTION
  -----------
@@ -21,6 +21,14 @@ OPTIONS
  --list::
         Show current config variables, name and value, for all sections.
  
+--user::
+       For writing and reading options: write to user
+       '$HOME/.perfconfig' file or read it.
+
+--system::
+       For writing and reading options: write to system-wide
+       '$(sysconfdir)/perfconfig' or read it.
+
  CONFIGURATION FILE
  ------------------
  
@@ -30,6 +38,10 @@ The '$HOME/.perfconfig' file is used to store a per-user configuration.
  The file '$(sysconfdir)/perfconfig' can be used to
  store a system-wide default configuration.
  
+When reading or writing, the values are read from the system and user
+configuration files by default, and options '--system' and '--user'
+can be used to tell the command to read from or write to only that location.
+
  Syntax
  ~~~~~~
  
@@ -62,7 +74,7 @@ Given a $HOME/.perfconfig like this:
                 medium = green, default
                 normal = lightgray, default
                 selected = white, lightgray
-               code = blue, default
+               jump_arrows = blue, default
                 addr = magenta, default
                 root = white, blue
  
@@ -98,6 +110,347 @@ Given a $HOME/.perfconfig like this:
                 order = caller
                 sort-key = function
  
+Variables
+~~~~~~~~~
+
+colors.*::
+       The variables for customizing the colors used in the output for the
+       'report', 'top' and 'annotate' in the TUI. They should specify the
+       foreground and background colors, separated by a comma, for example:
+
+               medium = green, lightgray
+
+       If you want to use the color configured for you terminal, just leave it
+       as 'default', for example:
+
+               medium = default, lightgray
+
+       Available colors:
+       red, yellow, green, cyan, gray, black, blue,
+       white, default, magenta, lightgray
+
+       colors.top::
+               'top' means a overhead percentage which is more than 5%.
+               And values of this variable specify percentage colors.
+               Basic key values are foreground-color 'red' and
+               background-color 'default'.
+       colors.medium::
+               'medium' means a overhead percentage which has more than 0.5%.
+               Default values are 'green' and 'default'.
+       colors.normal::
+               'normal' means the rest of overhead percentages
+               except 'top', 'medium', 'selected'.
+               Default values are 'lightgray' and 'default'.
+       colors.selected::
+               This selects the colors for the current entry in a list of entries
+               from sub-commands (top, report, annotate).
+               Default values are 'black' and 'lightgray'.
+       colors.jump_arrows::
+               Colors for jump arrows on assembly code listings
+               such as 'jns', 'jmp', 'jane', etc.
+               Default values are 'blue', 'default'.
+       colors.addr::
+               This selects colors for addresses from 'annotate'.
+               Default values are 'magenta', 'default'.
+       colors.root::
+               Colors for headers in the output of a sub-commands (top, report).
+               Default values are 'white', 'blue'.
+
+tui.*, gtk.*::
+       Subcommands that can be configured here are 'top', 'report' and 'annotate'.
+       These values are booleans, for example:
+
+       [tui]
+               top = true
+
+       will make the TUI be the default for the 'top' subcommand. Those will be
+       available if the required libs were detected at tool build time.
+
+buildid.*::
+       buildid.dir::
+               Each executable and shared library in modern distributions comes with a
+               content based identifier that, if available, will be inserted in a
+               'perf.data' file header to, at analysis time find what is needed to do
+               symbol resolution, code annotation, etc.
+
+               The recording tools also stores a hard link or copy in a per-user
+               directory, $HOME/.debug/, of binaries, shared libraries, /proc/kallsyms
+               and /proc/kcore files to be used at analysis time.
+
+               The buildid.dir variable can be used to either change this directory
+               cache location, or to disable it altogether. If you want to disable it,
+               set buildid.dir to /dev/null. The default is $HOME/.debug
+
+annotate.*::
+       These options work only for TUI.
+       These are in control of addresses, jump function, source code
+       in lines of assembly code from a specific program.
+
+       annotate.hide_src_code::
+               If a program which is analyzed has source code,
+               this option lets 'annotate' print a list of assembly code with the source code.
+               For example, let's see a part of a program. There're four lines.
+               If this option is 'true', they can be printed
+               without source code from a program as below.
+
+               │        push   %rbp
+               │        mov    %rsp,%rbp
+               │        sub    $0x10,%rsp
+               │        mov    (%rdi),%rdx
+
+               But if this option is 'false', source code of the part
+               can be also printed as below. Default is 'false'.
+
+               │      struct rb_node *rb_next(const struct rb_node *node)
+               │      {
+               │        push   %rbp
+               │        mov    %rsp,%rbp
+               │        sub    $0x10,%rsp
+               │              struct rb_node *parent;
+               │
+               │              if (RB_EMPTY_NODE(node))
+               │        mov    (%rdi),%rdx
+               │              return n;
+
+        annotate.use_offset::
+               Basing on a first address of a loaded function, offset can be used.
+               Instead of using original addresses of assembly code,
+               addresses subtracted from a base address can be printed.
+               Let's illustrate an example.
+               If a base address is 0XFFFFFFFF81624d50 as below,
+
+               ffffffff81624d50 <load0>
+
+               an address on assembly code has a specific absolute address as below
+
+               ffffffff816250b8:│  mov    0x8(%r14),%rdi
+
+               but if use_offset is 'true', an address subtracted from a base address is printed.
+               Default is true. This option is only applied to TUI.
+
+                            368:│  mov    0x8(%r14),%rdi
+
+       annotate.jump_arrows::
+               There can be jump instruction among assembly code.
+               Depending on a boolean value of jump_arrows,
+               arrows can be printed or not which represent
+               where do the instruction jump into as below.
+
+               │     ┌──jmp    1333
+               │     │  xchg   %ax,%ax
+               │1330:│  mov    %r15,%r10
+               │1333:└─→cmp    %r15,%r14
+
+               If jump_arrow is 'false', the arrows isn't printed as below.
+               Default is 'false'.
+
+               │      ↓ jmp    1333
+               │        xchg   %ax,%ax
+               │1330:   mov    %r15,%r10
+               │1333:   cmp    %r15,%r14
+
+        annotate.show_linenr::
+               When showing source code if this option is 'true',
+               line numbers are printed as below.
+
+               │1628         if (type & PERF_SAMPLE_IDENTIFIER) {
+               │     ↓ jne    508
+               │1628                 data->id = *array;
+               │1629                 array++;
+               │1630         }
+
+               However if this option is 'false', they aren't printed as below.
+               Default is 'false'.
+
+               │             if (type & PERF_SAMPLE_IDENTIFIER) {
+               │     ↓ jne    508
+               │                     data->id = *array;
+               │                     array++;
+               │             }
+
+        annotate.show_nr_jumps::
+               Let's see a part of assembly code.
+
+               │1382:   movb   $0x1,-0x270(%rbp)
+
+               If use this, the number of branches jumping to that address can be printed as below.
+               Default is 'false'.
+
+               │1 1382:   movb   $0x1,-0x270(%rbp)
+
+        annotate.show_total_period::
+               To compare two records on an instruction base, with this option
+               provided, display total number of samples that belong to a line
+               in assembly code. If this option is 'true', total periods are printed
+               instead of percent values as below.
+
+                 302 │      mov    %eax,%eax
+
+               But if this option is 'false', percent values for overhead are printed i.e.
+               Default is 'false'.
+
+               99.93 │      mov    %eax,%eax
+
+hist.*::
+       hist.percentage::
+               This option control the way to calculate overhead of filtered entries -
+               that means the value of this option is effective only if there's a
+               filter (by comm, dso or symbol name). Suppose a following example:
+
+                      Overhead  Symbols
+                      ........  .......
+                       33.33%     foo
+                       33.33%     bar
+                       33.33%     baz
+
+              This is an original overhead and we'll filter out the first 'foo'
+              entry. The value of 'relative' would increase the overhead of 'bar'
+              and 'baz' to 50.00% for each, while 'absolute' would show their
+              current overhead (33.33%).
+
+ui.*::
+       ui.show-headers::
+               This option controls display of column headers (like 'Overhead' and 'Symbol')
+               in 'report' and 'top'. If this option is false, they are hidden.
+               This option is only applied to TUI.
+
+call-graph.*::
+       When sub-commands 'top' and 'report' work with -g/—-children
+       there're options in control of call-graph.
+
+       call-graph.record-mode::
+               The record-mode can be 'fp' (frame pointer), 'dwarf' and 'lbr'.
+               The value of 'dwarf' is effective only if perf detect needed library
+               (libunwind or a recent version of libdw).
+               'lbr' only work for cpus that support it.
+
+       call-graph.dump-size::
+               The size of stack to dump in order to do post-unwinding. Default is 8192 (byte).
+               When using dwarf into record-mode, the default size will be used if omitted.
+
+       call-graph.print-type::
+               The print-types can be graph (graph absolute), fractal (graph relative),
+               flat and folded. This option controls a way to show overhead for each callchain
+               entry. Suppose a following example.
+
+                Overhead  Symbols
+                ........  .......
+                  40.00%  foo
+                          |
+                          ---foo
+                             |
+                             |--50.00%--bar
+                             |          main
+                             |
+                              --50.00%--baz
+                                        main
+
+               This output is a 'fractal' format. The 'foo' came from 'bar' and 'baz' exactly
+               half and half so 'fractal' shows 50.00% for each
+               (meaning that it assumes 100% total overhead of 'foo').
+
+               The 'graph' uses absolute overhead value of 'foo' as total so each of
+               'bar' and 'baz' callchain will have 20.00% of overhead.
+               If 'flat' is used, single column and linear exposure of call chains.
+               'folded' mean call chains are displayed in a line, separated by semicolons.
+
+       call-graph.order::
+               This option controls print order of callchains. The default is
+               'callee' which means callee is printed at top and then followed by its
+               caller and so on. The 'caller' prints it in reverse order.
+
+               If this option is not set and report.children or top.children is
+               set to true (or the equivalent command line option is given),
+               the default value of this option is changed to 'caller' for the
+               execution of 'perf report' or 'perf top'. Other commands will
+               still default to 'callee'.
+
+       call-graph.sort-key::
+               The callchains are merged if they contain same information.
+               The sort-key option determines a way to compare the callchains.
+               A value of 'sort-key' can be 'function' or 'address'.
+               The default is 'function'.
+
+       call-graph.threshold::
+               When there're many callchains it'd print tons of lines. So perf omits
+               small callchains under a certain overhead (threshold) and this option
+               control the threshold. Default is 0.5 (%). The overhead is calculated
+               by value depends on call-graph.print-type.
+
+       call-graph.print-limit::
+               This is a maximum number of lines of callchain printed for a single
+               histogram entry. Default is 0 which means no limitation.
+
+report.*::
+       report.percent-limit::
+               This one is mostly the same as call-graph.threshold but works for
+               histogram entries. Entries having an overhead lower than this
+               percentage will not be printed. Default is '0'. If percent-limit
+               is '10', only entries which have more than 10% of overhead will be
+               printed.
+
+       report.queue-size::
+               This option sets up the maximum allocation size of the internal
+               event queue for ordering events. Default is 0, meaning no limit.
+
+       report.children::
+               'Children' means functions called from another function.
+               If this option is true, 'perf report' cumulates callchains of children
+               and show (accumulated) total overhead as well as 'Self' overhead.
+               Please refer to the 'perf report' manual. The default is 'true'.
+
+       report.group::
+               This option is to show event group information together.
+               Example output with this turned on, notice that there is one column
+               per event in the group, ref-cycles and cycles:
+
+               # group: {ref-cycles,cycles}
+               # ========
+               #
+               # Samples: 7K of event 'anon group { ref-cycles, cycles }'
+               # Event count (approx.): 6876107743
+               #
+               #         Overhead  Command      Shared Object               Symbol
+               # ................  .......  .................  ...................
+               #
+                   99.84%  99.76%  noploop  noploop            [.] main
+                    0.07%   0.00%  noploop  ld-2.15.so         [.] strcmp
+                    0.03%   0.00%  noploop  [kernel.kallsyms]  [k] timerqueue_del
+
+top.*::
+       top.children::
+               Same as 'report.children'. So if it is enabled, the output of 'top'
+               command will have 'Children' overhead column as well as 'Self' overhead
+               column by default.
+               The default is 'true'.
+
+man.*::
+       man.viewer::
+               This option can assign a tool to view manual pages when 'help'
+               subcommand was invoked. Supported tools are 'man', 'woman'
+               (with emacs client) and 'konqueror'. Default is 'man'.
+
+               New man viewer tool can be also added using 'man.<tool>.cmd'
+               or use different path using 'man.<tool>.path' config option.
+
+pager.*::
+       pager.<subcommand>::
+               When the subcommand is run on stdio, determine whether it uses
+               pager or not based on this value. Default is 'unspecified'.
+
+kmem.*::
+       kmem.default::
+               This option decides which allocator is to be analyzed if neither
+               '--slab' nor '--page' option is used. Default is 'slab'.
+
+record.*::
+       record.build-id::
+               This option can be 'cache', 'no-cache' or 'skip'.
+               'cache' is to post-process data and save/update the binaries into
+               the build-id cache (in ~/.debug). This is the default.
+               But if this option is 'no-cache', it will not update the build-id cache.
+               'skip' skips post-processing and does not update the cache.
+
  SEE ALSO
  --------
  linkperf:perf[1]
diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt

index 0b1cedeef895369c9c60aa0a1e6cbb718c65c509..87b2588d1cbdee6060a304be4f79015cb5889171 100644 (file)
--- a/tools/perf/Documentation/perf-inject.txt
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -53,6 +53,13 @@ include::itrace.txt[]
  --strip::
         Use with --itrace to strip out non-synthesized events.
  
+-j::
+--jit::
+       Process jitdump files by injecting the mmap records corresponding to jitted
+       functions. This option also generates the ELF images for each jitted function
+       found in the jitdumps files captured in the input perf.data file. Use this option
+       if you are monitoring environment using JIT runtimes, such as Java, DART or V8.
+
  SEE ALSO
  --------
  linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1]
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt

index fbceb631387c31c44002080130b7ff6e282229d3..19aa17532a16709646dc52487c5f686675c2b658 100644 (file)
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -341,6 +341,12 @@ Specify vmlinux path which has debuginfo.
  --buildid-all::
  Record build-id of all DSOs regardless whether it's actually hit or not.
  
+--all-kernel::
+Configure all used events to run in kernel space.
+
+--all-user::
+Configure all used events to run in user space.
+
  SEE ALSO
  --------
  linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt

index 8a301f6afb37ad855351fcd5fba1644d12dc2529..12113992ac9d0f5ceca0b003cec568717c4208e5 100644 (file)
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -117,6 +117,22 @@ OPTIONS
         And default sort keys are changed to comm, dso_from, symbol_from, dso_to
         and symbol_to, see '--branch-stack'.
  
+       If the --mem-mode option is used, the following sort keys are also available
+       (incompatible with --branch-stack):
+       symbol_daddr, dso_daddr, locked, tlb, mem, snoop, dcacheline.
+
+       - symbol_daddr: name of data symbol being executed on at the time of sample
+       - dso_daddr: name of library or module containing the data being executed
+       on at the time of the sample
+       - locked: whether the bus was locked at the time of the sample
+       - tlb: type of tlb access for the data at the time of the sample
+       - mem: type of memory access for the data at the time of the sample
+       - snoop: type of snoop (if any) for the data at the time of the sample
+       - dcacheline: the cacheline the data address is on at the time of the sample
+
+       And the default sort keys are changed to local_weight, mem, sym, dso,
+       symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'.
+
         If the data file has tracepoint event(s), following (dynamic) sort keys
         are also available:
         trace, trace_fields, [<event>.]<field>[/raw]
@@ -151,22 +167,6 @@ OPTIONS
         By default, every sort keys not specified in -F will be appended
         automatically.
  
-       If --mem-mode option is used, following sort keys are also available
-       (incompatible with --branch-stack):
-       symbol_daddr, dso_daddr, locked, tlb, mem, snoop, dcacheline.
-
-       - symbol_daddr: name of data symbol being executed on at the time of sample
-       - dso_daddr: name of library or module containing the data being executed
-       on at the time of sample
-       - locked: whether the bus was locked at the time of sample
-       - tlb: type of tlb access for the data at the time of sample
-       - mem: type of memory access for the data at the time of sample
-       - snoop: type of snoop (if any) for the data at the time of sample
-       - dcacheline: the cacheline the data address is on at the time of sample
-
-       And default sort keys are changed to local_weight, mem, sym, dso,
-       symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'.
-
  -p::
  --parent=<regex>::
          A regex filter to identify parent. The parent is a caller of this
@@ -351,7 +351,10 @@ OPTIONS
  
  --percent-limit::
         Do not show entries which have an overhead under that percent.
-       (Default: 0).
+       (Default: 0).  Note that this option also sets the percent limit (threshold)
+       of callchains.  However the default value of callchain threshold is
+       different than the default value of hist entries.  Please see the
+       --call-graph option for details.
  
  --percentage::
         Determine how to display the overhead percentage of filtered entries.
@@ -398,6 +401,9 @@ include::itrace.txt[]
  --raw-trace::
         When displaying traceevent output, do not use print fmt or plugins.
  
+--hierarchy::
+       Enable hierarchical output.
+
  include::callchain-overhead-calculation.txt[]
  
  SEE ALSO
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt

index 52ef7a9d50aacbc3d3fbb0366852457d39718cdf..04f23b404bbc5a7bd4b5bcdfb7e833d79ba1234e 100644 (file)
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -69,6 +69,14 @@ report::
  --scale::
         scale/normalize counter values
  
+-d::
+--detailed::
+       print more detailed statistics, can be specified up to 3 times
+
+          -d:          detailed events, L1 and LLC data cache
+        -d -d:     more detailed events, dTLB and iTLB events
+     -d -d -d:     very detailed events, adding prefetch events
+
  -r::
  --repeat=<n>::
         repeat command and print average + stddev (max: 100). 0 means forever.
@@ -139,6 +147,10 @@ Print count deltas every N milliseconds (minimum: 10ms)
  The overhead percentage could be high in some cases, for instance with small, sub 100ms intervals.  Use with caution.
         example: 'perf stat -I 1000 -e cycles -a sleep 5'
  
+--metric-only::
+Only print computed metrics. Print them in a single line.
+Don't show any raw values. Not supported with --per-thread.
+
  --per-socket::
  Aggregate counts per processor socket for system-wide mode measurements.  This
  is a useful mode to detect imbalance between sockets.  To enable this mode,
@@ -211,6 +223,29 @@ $ perf stat -- make -j
  
   Wall-clock time elapsed:   719.554352 msecs
  
+CSV FORMAT
+----------
+
+With -x, perf stat is able to output a not-quite-CSV format output
+Commas in the output are not put into "". To make it easy to parse
+it is recommended to use a different character like -x \;
+
+The fields are in this order:
+
+       - optional usec time stamp in fractions of second (with -I xxx)
+       - optional CPU, core, or socket identifier
+       - optional number of logical CPUs aggregated
+       - counter value
+       - unit of the counter value or empty
+       - event name
+       - run time of counter
+       - percentage of measurement time the counter was running
+       - optional variance if multiple values are collected with -r
+       - optional metric value
+       - optional unit of metric
+
+Additional metrics may be printed with all earlier fields being empty.
+
  SEE ALSO
  --------
  linkperf:perf-top[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt

index b0e60e17db389d89efafa7025cd47fd44fe625c3..19f046f027cd81e42c5696ab3172539baaeb745d 100644 (file)
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -233,6 +233,9 @@ Default is to monitor all CPUS.
  --raw-trace::
         When displaying traceevent output, do not use print fmt or plugins.
  
+--hierarchy::
+       Enable hierarchy output.
+
  INTERACTIVE PROMPTING KEYS
  --------------------------
  
diff --git a/tools/perf/Documentation/perfconfig.example b/tools/perf/Documentation/perfconfig.example

index 767ea2436e1cd841a762ee8cdb039ba80a7120b3..1d8d5bc4cd2de6601f926696d5c172b30fa735d4 100644 (file)
--- a/tools/perf/Documentation/perfconfig.example
+++ b/tools/perf/Documentation/perfconfig.example
@@ -5,7 +5,7 @@
         medium = green, lightgray
         normal = black, lightgray
         selected = lightgray, magenta
-       code = blue, lightgray
+       jump_arrows = blue, lightgray
         addr = magenta, lightgray
  
  [tui]
diff --git a/tools/perf/Documentation/tips.txt b/tools/perf/Documentation/tips.txt

index e0ce9573b79bd26ddfe8cb4aee6eade061a09a12..5950b5a24efdf6024ca58a2da084d9cd43295c12 100644 (file)
--- a/tools/perf/Documentation/tips.txt
+++ b/tools/perf/Documentation/tips.txt
@@ -27,3 +27,4 @@ Skip collecing build-id when recording: perf record -B
  To change sampling frequency to 100 Hz: perf record -F 100
  See assembly instructions with percentage: perf annotate <symbol>
  If you prefer Intel style assembly, try: perf annotate -M intel
+For hierarchical output, try: perf report --hierarchy
diff --git a/tools/perf/Makefile b/tools/perf/Makefile

index dcd9a70c7193b43b5791058b01a3773da6e0d870..32a64e6190288e5110cfa6de78138aceb955e7af 100644 (file)
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -68,6 +68,20 @@ all tags TAGS:
         $(print_msg)
         $(make)
  
+ifdef MAKECMDGOALS
+has_clean := 0
+ifneq ($(filter clean,$(MAKECMDGOALS)),)
+  has_clean := 1
+endif # clean
+
+ifeq ($(has_clean),1)
+  rest := $(filter-out clean,$(MAKECMDGOALS))
+  ifneq ($(rest),)
+$(rest): clean
+  endif # rest
+endif # has_clean
+endif # MAKECMDGOALS
+
  #
  # The clean target is not really parallel, don't print the jobs info:
  #
@@ -75,10 +89,17 @@ clean:
         $(make)
  
  #
-# The build-test target is not really parallel, don't print the jobs info:
+# The build-test target is not really parallel, don't print the jobs info,
+# it also uses only the tests/make targets that don't pollute the source
+# repository, i.e. that uses O= or builds the tarpkg outside the source
+# repo directories.
+#
+# For a full test, use:
+#
+# make -C tools/perf -f tests/make
  #
  build-test:
-       @$(MAKE) SHUF=1 -f tests/make --no-print-directory
+       @$(MAKE) SHUF=1 -f tests/make REUSE_FEATURES_DUMP=1 MK=Makefile SET_PARALLEL=1 --no-print-directory tarpkg out
  
  #
  # All other targets get passed through:
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf

index 5d34815c7ccb8e02ac6d4db0d82d4712af73f343..4a4fad4182f534f23550ca5ac1ed5d42000f9b08 100644 (file)
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -58,6 +58,9 @@ include config/utilities.mak
  #
  # Define NO_LIBBIONIC if you do not want bionic support
  #
+# Define NO_LIBCRYPTO if you do not want libcrypto (openssl) support
+# used for generating build-ids for ELFs generated by jitdump.
+#
  # Define NO_LIBDW_DWARF_UNWIND if you do not want libdw support
  # for dwarf backtrace post unwind.
  #
@@ -136,6 +139,8 @@ $(call allow-override,CC,$(CROSS_COMPILE)gcc)
  $(call allow-override,AR,$(CROSS_COMPILE)ar)
  $(call allow-override,LD,$(CROSS_COMPILE)ld)
  
+LD += $(EXTRA_LDFLAGS)
+
  PKG_CONFIG = $(CROSS_COMPILE)pkg-config
  
  RM      = rm -f
@@ -165,7 +170,16 @@ ifeq ($(filter-out $(NON_CONFIG_TARGETS),$(MAKECMDGOALS)),)
  endif
  endif
  
+# Set FEATURE_TESTS to 'all' so all possible feature checkers are executed.
+# Without this setting the output feature dump file misses some features, for
+# example, liberty. Select all checkers so we won't get an incomplete feature
+# dump file.
  ifeq ($(config),1)
+ifdef MAKECMDGOALS
+ifeq ($(filter feature-dump,$(MAKECMDGOALS)),feature-dump)
+FEATURE_TESTS := all
+endif
+endif
  include config/Makefile
  endif
  
@@ -618,7 +632,7 @@ clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean
         $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32
         $(call QUIET_CLEAN, core-gen)   $(RM)  *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \
                 $(OUTPUT)util/intel-pt-decoder/inat-tables.c $(OUTPUT)fixdep \
-               $(OUTPUT)tests/llvm-src-{base,kbuild,prologue}.c
+               $(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c
         $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean
         $(python-clean)
  
diff --git a/tools/perf/arch/arm/Makefile b/tools/perf/arch/arm/Makefile

index 7fbca175099ec917ad69b8025c8249ee6c52a6a4..18b13518d8d8701137d489a65f7dd752bdc7f43f 100644 (file)
--- a/tools/perf/arch/arm/Makefile
+++ b/tools/perf/arch/arm/Makefile
@@ -1,3 +1,4 @@
  ifndef NO_DWARF
  PERF_HAVE_DWARF_REGS := 1
  endif
+PERF_HAVE_JITDUMP := 1
diff --git a/tools/perf/arch/arm64/Makefile b/tools/perf/arch/arm64/Makefile

index 7fbca175099ec917ad69b8025c8249ee6c52a6a4..18b13518d8d8701137d489a65f7dd752bdc7f43f 100644 (file)
--- a/tools/perf/arch/arm64/Makefile
+++ b/tools/perf/arch/arm64/Makefile
@@ -1,3 +1,4 @@
  ifndef NO_DWARF
  PERF_HAVE_DWARF_REGS := 1
  endif
+PERF_HAVE_JITDUMP := 1
diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile

index 7fbca175099ec917ad69b8025c8249ee6c52a6a4..56e05f126ad8793d25bb1a70ce3dac5f100d3b3a 100644 (file)
--- a/tools/perf/arch/powerpc/Makefile
+++ b/tools/perf/arch/powerpc/Makefile
@@ -1,3 +1,6 @@
  ifndef NO_DWARF
  PERF_HAVE_DWARF_REGS := 1
  endif
+
+HAVE_KVM_STAT_SUPPORT := 1
+PERF_HAVE_JITDUMP := 1
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build

index 7b8b0d1a1b626065e0b414f42a7a998723e0e57f..c8fe2074d2177f0709f0e1d54f63cd77f84f2233 100644 (file)
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -1,5 +1,6 @@
  libperf-y += header.o
  libperf-y += sym-handling.o
+libperf-y += kvm-stat.o
  
  libperf-$(CONFIG_DWARF) += dwarf-regs.o
  libperf-$(CONFIG_DWARF) += skip-callchain-idx.o
diff --git a/tools/perf/arch/powerpc/util/book3s_hcalls.h b/tools/perf/arch/powerpc/util/book3s_hcalls.h

new file mode 100644 (file)

index 0000000..0dd6b7f
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/book3s_hcalls.h
@@ -0,0 +1,123 @@
+#ifndef ARCH_PERF_BOOK3S_HV_HCALLS_H
+#define ARCH_PERF_BOOK3S_HV_HCALLS_H
+
+/*
+ * PowerPC HCALL codes : hcall code to name mapping
+ */
+#define kvm_trace_symbol_hcall \
+       {0x4, "H_REMOVE"},                                      \
+       {0x8, "H_ENTER"},                                       \
+       {0xc, "H_READ"},                                        \
+       {0x10, "H_CLEAR_MOD"},                                  \
+       {0x14, "H_CLEAR_REF"},                                  \
+       {0x18, "H_PROTECT"},                                    \
+       {0x1c, "H_GET_TCE"},                                    \
+       {0x20, "H_PUT_TCE"},                                    \
+       {0x24, "H_SET_SPRG0"},                                  \
+       {0x28, "H_SET_DABR"},                                   \
+       {0x2c, "H_PAGE_INIT"},                                  \
+       {0x30, "H_SET_ASR"},                                    \
+       {0x34, "H_ASR_ON"},                                     \
+       {0x38, "H_ASR_OFF"},                                    \
+       {0x3c, "H_LOGICAL_CI_LOAD"},                            \
+       {0x40, "H_LOGICAL_CI_STORE"},                           \
+       {0x44, "H_LOGICAL_CACHE_LOAD"},                         \
+       {0x48, "H_LOGICAL_CACHE_STORE"},                        \
+       {0x4c, "H_LOGICAL_ICBI"},                               \
+       {0x50, "H_LOGICAL_DCBF"},                               \
+       {0x54, "H_GET_TERM_CHAR"},                              \
+       {0x58, "H_PUT_TERM_CHAR"},                              \
+       {0x5c, "H_REAL_TO_LOGICAL"},                            \
+       {0x60, "H_HYPERVISOR_DATA"},                            \
+       {0x64, "H_EOI"},                                        \
+       {0x68, "H_CPPR"},                                       \
+       {0x6c, "H_IPI"},                                        \
+       {0x70, "H_IPOLL"},                                      \
+       {0x74, "H_XIRR"},                                       \
+       {0x78, "H_MIGRATE_DMA"},                                \
+       {0x7c, "H_PERFMON"},                                    \
+       {0xdc, "H_REGISTER_VPA"},                               \
+       {0xe0, "H_CEDE"},                                       \
+       {0xe4, "H_CONFER"},                                     \
+       {0xe8, "H_PROD"},                                       \
+       {0xec, "H_GET_PPP"},                                    \
+       {0xf0, "H_SET_PPP"},                                    \
+       {0xf4, "H_PURR"},                                       \
+       {0xf8, "H_PIC"},                                        \
+       {0xfc, "H_REG_CRQ"},                                    \
+       {0x100, "H_FREE_CRQ"},                                  \
+       {0x104, "H_VIO_SIGNAL"},                                \
+       {0x108, "H_SEND_CRQ"},                                  \
+       {0x110, "H_COPY_RDMA"},                                 \
+       {0x114, "H_REGISTER_LOGICAL_LAN"},                      \
+       {0x118, "H_FREE_LOGICAL_LAN"},                          \
+       {0x11c, "H_ADD_LOGICAL_LAN_BUFFER"},                    \
+       {0x120, "H_SEND_LOGICAL_LAN"},                          \
+       {0x124, "H_BULK_REMOVE"},                               \
+       {0x130, "H_MULTICAST_CTRL"},                            \
+       {0x134, "H_SET_XDABR"},                                 \
+       {0x138, "H_STUFF_TCE"},                                 \
+       {0x13c, "H_PUT_TCE_INDIRECT"},                          \
+       {0x14c, "H_CHANGE_LOGICAL_LAN_MAC"},                    \
+       {0x150, "H_VTERM_PARTNER_INFO"},                        \
+       {0x154, "H_REGISTER_VTERM"},                            \
+       {0x158, "H_FREE_VTERM"},                                \
+       {0x15c, "H_RESET_EVENTS"},                              \
+       {0x160, "H_ALLOC_RESOURCE"},                            \
+       {0x164, "H_FREE_RESOURCE"},                             \
+       {0x168, "H_MODIFY_QP"},                                 \
+       {0x16c, "H_QUERY_QP"},                                  \
+       {0x170, "H_REREGISTER_PMR"},                            \
+       {0x174, "H_REGISTER_SMR"},                              \
+       {0x178, "H_QUERY_MR"},                                  \
+       {0x17c, "H_QUERY_MW"},                                  \
+       {0x180, "H_QUERY_HCA"},                                 \
+       {0x184, "H_QUERY_PORT"},                                \
+       {0x188, "H_MODIFY_PORT"},                               \
+       {0x18c, "H_DEFINE_AQP1"},                               \
+       {0x190, "H_GET_TRACE_BUFFER"},                          \
+       {0x194, "H_DEFINE_AQP0"},                               \
+       {0x198, "H_RESIZE_MR"},                                 \
+       {0x19c, "H_ATTACH_MCQP"},                               \
+       {0x1a0, "H_DETACH_MCQP"},                               \
+       {0x1a4, "H_CREATE_RPT"},                                \
+       {0x1a8, "H_REMOVE_RPT"},                                \
+       {0x1ac, "H_REGISTER_RPAGES"},                           \
+       {0x1b0, "H_DISABLE_AND_GETC"},                          \
+       {0x1b4, "H_ERROR_DATA"},                                \
+       {0x1b8, "H_GET_HCA_INFO"},                              \
+       {0x1bc, "H_GET_PERF_COUNT"},                            \
+       {0x1c0, "H_MANAGE_TRACE"},                              \
+       {0x1d4, "H_FREE_LOGICAL_LAN_BUFFER"},                   \
+       {0x1d8, "H_POLL_PENDING"},                              \
+       {0x1e4, "H_QUERY_INT_STATE"},                           \
+       {0x244, "H_ILLAN_ATTRIBUTES"},                          \
+       {0x250, "H_MODIFY_HEA_QP"},                             \
+       {0x254, "H_QUERY_HEA_QP"},                              \
+       {0x258, "H_QUERY_HEA"},                                 \
+       {0x25c, "H_QUERY_HEA_PORT"},                            \
+       {0x260, "H_MODIFY_HEA_PORT"},                           \
+       {0x264, "H_REG_BCMC"},                                  \
+       {0x268, "H_DEREG_BCMC"},                                \
+       {0x26c, "H_REGISTER_HEA_RPAGES"},                       \
+       {0x270, "H_DISABLE_AND_GET_HEA"},                       \
+       {0x274, "H_GET_HEA_INFO"},                              \
+       {0x278, "H_ALLOC_HEA_RESOURCE"},                        \
+       {0x284, "H_ADD_CONN"},                                  \
+       {0x288, "H_DEL_CONN"},                                  \
+       {0x298, "H_JOIN"},                                      \
+       {0x2a4, "H_VASI_STATE"},                                \
+       {0x2b0, "H_ENABLE_CRQ"},                                \
+       {0x2b8, "H_GET_EM_PARMS"},                              \
+       {0x2d0, "H_SET_MPP"},                                   \
+       {0x2d4, "H_GET_MPP"},                                   \
+       {0x2ec, "H_HOME_NODE_ASSOCIATIVITY"},                   \
+       {0x2f4, "H_BEST_ENERGY"},                               \
+       {0x2fc, "H_XIRR_X"},                                    \
+       {0x300, "H_RANDOM"},                                    \
+       {0x304, "H_COP"},                                       \
+       {0x314, "H_GET_MPP_X"},                                 \
+       {0x31c, "H_SET_MODE"},                                  \
+       {0xf000, "H_RTAS"}                                      \
+
+#endif
diff --git a/tools/perf/arch/powerpc/util/book3s_hv_exits.h b/tools/perf/arch/powerpc/util/book3s_hv_exits.h

new file mode 100644 (file)

index 0000000..e68ba2d
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/book3s_hv_exits.h
@@ -0,0 +1,33 @@
+#ifndef ARCH_PERF_BOOK3S_HV_EXITS_H
+#define ARCH_PERF_BOOK3S_HV_EXITS_H
+
+/*
+ * PowerPC Interrupt vectors : exit code to name mapping
+ */
+
+#define kvm_trace_symbol_exit \
+       {0x0,   "RETURN_TO_HOST"}, \
+       {0x100, "SYSTEM_RESET"}, \
+       {0x200, "MACHINE_CHECK"}, \
+       {0x300, "DATA_STORAGE"}, \
+       {0x380, "DATA_SEGMENT"}, \
+       {0x400, "INST_STORAGE"}, \
+       {0x480, "INST_SEGMENT"}, \
+       {0x500, "EXTERNAL"}, \
+       {0x501, "EXTERNAL_LEVEL"}, \
+       {0x502, "EXTERNAL_HV"}, \
+       {0x600, "ALIGNMENT"}, \
+       {0x700, "PROGRAM"}, \
+       {0x800, "FP_UNAVAIL"}, \
+       {0x900, "DECREMENTER"}, \
+       {0x980, "HV_DECREMENTER"}, \
+       {0xc00, "SYSCALL"}, \
+       {0xd00, "TRACE"}, \
+       {0xe00, "H_DATA_STORAGE"}, \
+       {0xe20, "H_INST_STORAGE"}, \
+       {0xe40, "H_EMUL_ASSIST"}, \
+       {0xf00, "PERFMON"}, \
+       {0xf20, "ALTIVEC"}, \
+       {0xf40, "VSX"}
+
+#endif
diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c

new file mode 100644 (file)

index 0000000..74eee30
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -0,0 +1,170 @@
+#include "util/kvm-stat.h"
+#include "util/parse-events.h"
+#include "util/debug.h"
+
+#include "book3s_hv_exits.h"
+#include "book3s_hcalls.h"
+
+#define NR_TPS 4
+
+const char *vcpu_id_str = "vcpu_id";
+const int decode_str_len = 40;
+const char *kvm_entry_trace = "kvm_hv:kvm_guest_enter";
+const char *kvm_exit_trace = "kvm_hv:kvm_guest_exit";
+
+define_exit_reasons_table(hv_exit_reasons, kvm_trace_symbol_exit);
+define_exit_reasons_table(hcall_reasons, kvm_trace_symbol_hcall);
+
+/* Tracepoints specific to ppc_book3s_hv */
+const char *ppc_book3s_hv_kvm_tp[] = {
+       "kvm_hv:kvm_guest_enter",
+       "kvm_hv:kvm_guest_exit",
+       "kvm_hv:kvm_hcall_enter",
+       "kvm_hv:kvm_hcall_exit",
+       NULL,
+};
+
+/* 1 extra placeholder for NULL */
+const char *kvm_events_tp[NR_TPS + 1];
+const char *kvm_exit_reason;
+
+static void hcall_event_get_key(struct perf_evsel *evsel,
+                               struct perf_sample *sample,
+                               struct event_key *key)
+{
+       key->info = 0;
+       key->key = perf_evsel__intval(evsel, sample, "req");
+}
+
+static const char *get_hcall_exit_reason(u64 exit_code)
+{
+       struct exit_reasons_table *tbl = hcall_reasons;
+
+       while (tbl->reason != NULL) {
+               if (tbl->exit_code == exit_code)
+                       return tbl->reason;
+               tbl++;
+       }
+
+       pr_debug("Unknown hcall code: %lld\n",
+              (unsigned long long)exit_code);
+       return "UNKNOWN";
+}
+
+static bool hcall_event_end(struct perf_evsel *evsel,
+                           struct perf_sample *sample __maybe_unused,
+                           struct event_key *key __maybe_unused)
+{
+       return (!strcmp(evsel->name, kvm_events_tp[3]));
+}
+
+static bool hcall_event_begin(struct perf_evsel *evsel,
+                             struct perf_sample *sample, struct event_key *key)
+{
+       if (!strcmp(evsel->name, kvm_events_tp[2])) {
+               hcall_event_get_key(evsel, sample, key);
+               return true;
+       }
+
+       return false;
+}
+static void hcall_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
+                                  struct event_key *key,
+                                  char *decode)
+{
+       const char *hcall_reason = get_hcall_exit_reason(key->key);
+
+       scnprintf(decode, decode_str_len, "%s", hcall_reason);
+}
+
+static struct kvm_events_ops hcall_events = {
+       .is_begin_event = hcall_event_begin,
+       .is_end_event = hcall_event_end,
+       .decode_key = hcall_event_decode_key,
+       .name = "HCALL-EVENT",
+};
+
+static struct kvm_events_ops exit_events = {
+       .is_begin_event = exit_event_begin,
+       .is_end_event = exit_event_end,
+       .decode_key = exit_event_decode_key,
+       .name = "VM-EXIT"
+};
+
+struct kvm_reg_events_ops kvm_reg_events_ops[] = {
+       { .name = "vmexit", .ops = &exit_events },
+       { .name = "hcall", .ops = &hcall_events },
+       { NULL, NULL },
+};
+
+const char * const kvm_skip_events[] = {
+       NULL,
+};
+
+
+static int is_tracepoint_available(const char *str, struct perf_evlist *evlist)
+{
+       struct parse_events_error err;
+       int ret;
+
+       err.str = NULL;
+       ret = parse_events(evlist, str, &err);
+       if (err.str)
+               pr_err("%s : %s\n", str, err.str);
+       return ret;
+}
+
+static int ppc__setup_book3s_hv(struct perf_kvm_stat *kvm,
+                               struct perf_evlist *evlist)
+{
+       const char **events_ptr;
+       int i, nr_tp = 0, err = -1;
+
+       /* Check for book3s_hv tracepoints */
+       for (events_ptr = ppc_book3s_hv_kvm_tp; *events_ptr; events_ptr++) {
+               err = is_tracepoint_available(*events_ptr, evlist);
+               if (err)
+                       return -1;
+               nr_tp++;
+       }
+
+       for (i = 0; i < nr_tp; i++)
+               kvm_events_tp[i] = ppc_book3s_hv_kvm_tp[i];
+
+       kvm_events_tp[i] = NULL;
+       kvm_exit_reason = "trap";
+       kvm->exit_reasons = hv_exit_reasons;
+       kvm->exit_reasons_isa = "HV";
+
+       return 0;
+}
+
+/* Wrapper to setup kvm tracepoints */
+static int ppc__setup_kvm_tp(struct perf_kvm_stat *kvm)
+{
+       struct perf_evlist *evlist = perf_evlist__new();
+
+       if (evlist == NULL)
+               return -ENOMEM;
+
+       /* Right now, only supported on book3s_hv */
+       return ppc__setup_book3s_hv(kvm, evlist);
+}
+
+int setup_kvm_events_tp(struct perf_kvm_stat *kvm)
+{
+       return ppc__setup_kvm_tp(kvm);
+}
+
+int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
+{
+       int ret;
+
+       ret = ppc__setup_kvm_tp(kvm);
+       if (ret) {
+               kvm->exit_reasons = NULL;
+               kvm->exit_reasons_isa = NULL;
+       }
+
+       return ret;
+}
diff --git a/tools/perf/arch/s390/util/kvm-stat.c b/tools/perf/arch/s390/util/kvm-stat.c

index a5dbc07ec9dc70900ed4a593f5979d419061fd35..ed57df2e6d687d89ccf95d9aa03c263411de7fe9 100644 (file)
--- a/tools/perf/arch/s390/util/kvm-stat.c
+++ b/tools/perf/arch/s390/util/kvm-stat.c
@@ -10,7 +10,7 @@
   */
  
  #include "../../util/kvm-stat.h"
-#include <asm/kvm_perf.h>
+#include <asm/sie.h>
  
  define_exit_reasons_table(sie_exit_reasons, sie_intercept_code);
  define_exit_reasons_table(sie_icpt_insn_codes, icpt_insn_codes);
@@ -18,6 +18,12 @@ define_exit_reasons_table(sie_sigp_order_codes, sigp_order_codes);
  define_exit_reasons_table(sie_diagnose_codes, diagnose_codes);
  define_exit_reasons_table(sie_icpt_prog_codes, icpt_prog_codes);
  
+const char *vcpu_id_str = "id";
+const int decode_str_len = 40;
+const char *kvm_exit_reason = "icptcode";
+const char *kvm_entry_trace = "kvm:kvm_s390_sie_enter";
+const char *kvm_exit_trace = "kvm:kvm_s390_sie_exit";
+
  static void event_icpt_insn_get_key(struct perf_evsel *evsel,
                                     struct perf_sample *sample,
                                     struct event_key *key)
@@ -73,7 +79,7 @@ static struct kvm_events_ops exit_events = {
         .name = "VM-EXIT"
  };
  
-const char * const kvm_events_tp[] = {
+const char *kvm_events_tp[] = {
         "kvm:kvm_s390_sie_enter",
         "kvm:kvm_s390_sie_exit",
         "kvm:kvm_s390_intercept_instruction",
diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile

index 09ba923debe86810f8380f7df54504dee4232ec8..269af21437353b2fb886383ede1e3f9a2f586f25 100644 (file)
--- a/tools/perf/arch/x86/Makefile
+++ b/tools/perf/arch/x86/Makefile
@@ -3,3 +3,4 @@ PERF_HAVE_DWARF_REGS := 1
  endif
  HAVE_KVM_STAT_SUPPORT := 1
  PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
+PERF_HAVE_JITDUMP := 1
diff --git a/tools/perf/arch/x86/tests/rdpmc.c b/tools/perf/arch/x86/tests/rdpmc.c

index 7bb0d13c235f70326afc28d726f38f1d95499055..72193f19d6d75d3c32e19b424a0f1910ebbc24a6 100644 (file)
--- a/tools/perf/arch/x86/tests/rdpmc.c
+++ b/tools/perf/arch/x86/tests/rdpmc.c
@@ -59,7 +59,7 @@ static u64 mmap_read_self(void *addr)
                 u64 quot, rem;
  
                 quot = (cyc >> time_shift);
-               rem = cyc & ((1 << time_shift) - 1);
+               rem = cyc & (((u64)1 << time_shift) - 1);
                 delta = time_offset + quot * time_mult +
                         ((rem * time_mult) >> time_shift);
  
@@ -103,6 +103,7 @@ static int __test__rdpmc(void)
  
         sigfillset(&sa.sa_mask);
         sa.sa_sigaction = segfault_handler;
+       sa.sa_flags = 0;
         sigaction(SIGSEGV, &sa, NULL);
  
         fd = sys_perf_event_open(&attr, 0, -1, -1,
diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c

index 8d8150f1cf9bcf426b4c29a19170221db680a980..d66f9ad4df2ea5da6eca22cae351a842d1143508 100644 (file)
--- a/tools/perf/arch/x86/util/intel-bts.c
+++ b/tools/perf/arch/x86/util/intel-bts.c
@@ -60,7 +60,9 @@ struct branch {
         u64 misc;
  };
  
-static size_t intel_bts_info_priv_size(struct auxtrace_record *itr __maybe_unused)
+static size_t
+intel_bts_info_priv_size(struct auxtrace_record *itr __maybe_unused,
+                        struct perf_evlist *evlist __maybe_unused)
  {
         return INTEL_BTS_AUXTRACE_PRIV_SIZE;
  }
diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c

index f05daacc9e7810117cfe25415a75990b3aa89f63..a3395179c9eebd5fcd39aa4796a0dc3d495f4b5d 100644 (file)
--- a/tools/perf/arch/x86/util/intel-pt.c
+++ b/tools/perf/arch/x86/util/intel-pt.c
@@ -89,7 +89,7 @@ static int intel_pt_parse_terms_with_default(struct list_head *formats,
  
         *config = attr.config;
  out_free:
-       parse_events__free_terms(terms);
+       parse_events_terms__delete(terms);
         return err;
  }
  
@@ -273,7 +273,9 @@ intel_pt_pmu_default_config(struct perf_pmu *intel_pt_pmu)
         return attr;
  }
  
-static size_t intel_pt_info_priv_size(struct auxtrace_record *itr __maybe_unused)
+static size_t
+intel_pt_info_priv_size(struct auxtrace_record *itr __maybe_unused,
+                       struct perf_evlist *evlist __maybe_unused)
  {
         return INTEL_PT_AUXTRACE_PRIV_SIZE;
  }
diff --git a/tools/perf/arch/x86/util/kvm-stat.c b/tools/perf/arch/x86/util/kvm-stat.c

index 14e4e668fad733c1802908047ae524318aa586de..b63d4be655a24a678f69d6deea3b4ee87283b938 100644 (file)
--- a/tools/perf/arch/x86/util/kvm-stat.c
+++ b/tools/perf/arch/x86/util/kvm-stat.c
@@ -1,5 +1,7 @@
  #include "../../util/kvm-stat.h"
-#include <asm/kvm_perf.h>
+#include <asm/svm.h>
+#include <asm/vmx.h>
+#include <asm/kvm.h>
  
  define_exit_reasons_table(vmx_exit_reasons, VMX_EXIT_REASONS);
  define_exit_reasons_table(svm_exit_reasons, SVM_EXIT_REASONS);
@@ -11,6 +13,12 @@ static struct kvm_events_ops exit_events = {
         .name = "VM-EXIT"
  };
  
+const char *vcpu_id_str = "vcpu_id";
+const int decode_str_len = 20;
+const char *kvm_exit_reason = "exit_reason";
+const char *kvm_entry_trace = "kvm:kvm_entry";
+const char *kvm_exit_trace = "kvm:kvm_exit";
+
  /*
   * For the mmio events, we treat:
   * the time of MMIO write: kvm_mmio(KVM_TRACE_MMIO_WRITE...) -> kvm_entry
@@ -65,7 +73,7 @@ static void mmio_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
                                   struct event_key *key,
                                   char *decode)
  {
-       scnprintf(decode, DECODE_STR_LEN, "%#lx:%s",
+       scnprintf(decode, decode_str_len, "%#lx:%s",
                   (unsigned long)key->key,
                   key->info == KVM_TRACE_MMIO_WRITE ? "W" : "R");
  }
@@ -109,7 +117,7 @@ static void ioport_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
                                     struct event_key *key,
                                     char *decode)
  {
-       scnprintf(decode, DECODE_STR_LEN, "%#llx:%s",
+       scnprintf(decode, decode_str_len, "%#llx:%s",
                   (unsigned long long)key->key,
                   key->info ? "POUT" : "PIN");
  }
@@ -121,7 +129,7 @@ static struct kvm_events_ops ioport_events = {
         .name = "IO Port Access"
  };
  
-const char * const kvm_events_tp[] = {
+const char *kvm_events_tp[] = {
         "kvm:kvm_entry",
         "kvm:kvm_exit",
         "kvm:kvm_mmio",
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S

index e4c2c30143b95133913a2bc3569b2871b79aa75c..5c3cce082cb88c5cab984bdb1ae6944274861ad4 100644 (file)
--- a/tools/perf/bench/mem-memcpy-x86-64-asm.S
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -1,6 +1,11 @@
+
+/* Various wrappers to make the kernel .S file build in user-space: */
+
  #define memcpy MEMCPY /* don't hide glibc's memcpy() */
  #define altinstr_replacement text
  #define globl p2align 4; .globl
+#define _ASM_EXTABLE_FAULT(x, y)
+
  #include "../../../arch/x86/lib/memcpy_64.S"
  /*
   * We need to provide note.GNU-stack section, saying that we want
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c

index cc5c1267c738da038cfb6b824f79854fab41a629..cfe366375c4b89bb3642f5531d04b418a8d76e56 100644 (file)
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -245,7 +245,7 @@ static int __cmd_annotate(struct perf_annotate *ann)
                         hists__collapse_resort(hists, NULL);
                         /* Don't sort callchain */
                         perf_evsel__reset_sample_bit(pos, CALLCHAIN);
-                       hists__output_resort(hists, NULL);
+                       perf_evsel__output_resort(pos, NULL);
  
                         if (symbol_conf.event_group &&
                             !perf_evsel__is_group_leader(pos))
diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c

index d93bff7fc0e407d6518a6ce0e2ee0dd4b65bf910..632efc6b79a07e20c4b25f9757bcd9c089050270 100644 (file)
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -38,19 +38,7 @@ static int build_id_cache__kcore_buildid(const char *proc_dir, char *sbuildid)
  
  static int build_id_cache__kcore_dir(char *dir, size_t sz)
  {
-       struct timeval tv;
-       struct tm tm;
-       char dt[32];
-
-       if (gettimeofday(&tv, NULL) || !localtime_r(&tv.tv_sec, &tm))
-               return -1;
-
-       if (!strftime(dt, sizeof(dt), "%Y%m%d%H%M%S", &tm))
-               return -1;
-
-       scnprintf(dir, sz, "%s%02u", dt, (unsigned)tv.tv_usec / 10000);
-
-       return 0;
+       return fetch_current_timestamp(dir, sz);
  }
  
  static bool same_kallsyms_reloc(const char *from_dir, char *to_dir)
diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c

index f04e804a9fadc6807a7bc74810fac5ddaad912b2..c42448ed5dfe20a74af9c671669aa23a25cadb5b 100644 (file)
--- a/tools/perf/builtin-config.c
+++ b/tools/perf/builtin-config.c
@@ -13,8 +13,10 @@
  #include "util/util.h"
  #include "util/debug.h"
  
+static bool use_system_config, use_user_config;
+
  static const char * const config_usage[] = {
-       "perf config [options]",
+       "perf config [<file-option>] [options]",
         NULL
  };
  
@@ -25,6 +27,8 @@ enum actions {
  static struct option config_options[] = {
         OPT_SET_UINT('l', "list", &actions,
                      "show current config variables", ACTION_LIST),
+       OPT_BOOLEAN(0, "system", &use_system_config, "use system config file"),
+       OPT_BOOLEAN(0, "user", &use_user_config, "use user config file"),
         OPT_END()
  };
  
@@ -42,10 +46,23 @@ static int show_config(const char *key, const char *value,
  int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused)
  {
         int ret = 0;
+       char *user_config = mkpath("%s/.perfconfig", getenv("HOME"));
  
         argc = parse_options(argc, argv, config_options, config_usage,
                              PARSE_OPT_STOP_AT_NON_OPTION);
  
+       if (use_system_config && use_user_config) {
+               pr_err("Error: only one config file at a time\n");
+               parse_options_usage(config_usage, config_options, "user", 0);
+               parse_options_usage(NULL, config_options, "system", 0);
+               return -1;
+       }
+
+       if (use_system_config)
+               config_exclusive_filename = perf_etc_perfconfig();
+       else if (use_user_config)
+               config_exclusive_filename = user_config;
+
         switch (actions) {
         case ACTION_LIST:
                 if (argc) {
@@ -53,9 +70,13 @@ int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused)
                         parse_options_usage(config_usage, config_options, "l", 1);
                 } else {
                         ret = perf_config(show_config, NULL);
-                       if (ret < 0)
+                       if (ret < 0) {
+                               const char * config_filename = config_exclusive_filename;
+                               if (!config_exclusive_filename)
+                                       config_filename = user_config;
                                 pr_err("Nothing configured, "
-                                      "please check your ~/.perfconfig file\n");
+                                      "please check your %s \n", config_filename);
+                       }
                 }
                 break;
         default:
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c

index 36ccc2b8827fdd239b657e79b396ae013c318462..4d72359fd15ad00ca7397bb033e84b4f899546c2 100644 (file)
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -1264,8 +1264,6 @@ int cmd_diff(int argc, const char **argv, const char *prefix __maybe_unused)
         if (ret < 0)
                 return ret;
  
-       perf_config(perf_default_config, NULL);
-
         argc = parse_options(argc, argv, options, diff_usage, 0);
  
         if (symbol__init(NULL) < 0)
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c

index 96c1a4cfbbbf6b1f639d91d562c55b09da5c7e6f..49d55e21b1b06dbecd9c6935a2e0f33550585514 100644 (file)
--- a/tools/perf/builtin-help.c
+++ b/tools/perf/builtin-help.c
@@ -86,8 +86,7 @@ static int check_emacsclient_version(void)
                 return -1;
         }
  
-       strbuf_remove(&buffer, 0, strlen("emacsclient"));
-       version = atoi(buffer.buf);
+       version = atoi(buffer.buf + strlen("emacsclient"));
  
         if (version < 22) {
                 fprintf(stderr,
@@ -273,7 +272,7 @@ static int perf_help_config(const char *var, const char *value, void *cb)
         if (!prefixcmp(var, "man."))
                 return add_man_viewer_info(var, value);
  
-       return perf_default_config(var, value, cb);
+       return 0;
  }
  
  static struct cmdnames main_cmds, other_cmds;
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c

index 0022e02ed31a7034b9286884ab87c4c131e41fa3..7fa68663ed7287f63adad2342db987941ba0c1b4 100644 (file)
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -17,6 +17,7 @@
  #include "util/build-id.h"
  #include "util/data.h"
  #include "util/auxtrace.h"
+#include "util/jit.h"
  
  #include <subcmd/parse-options.h>
  
@@ -29,6 +30,7 @@ struct perf_inject {
         bool                    sched_stat;
         bool                    have_auxtrace;
         bool                    strip;
+       bool                    jit_mode;
         const char              *input_name;
         struct perf_data_file   output;
         u64                     bytes_written;
@@ -71,6 +73,15 @@ static int perf_event__repipe_oe_synth(struct perf_tool *tool,
         return perf_event__repipe_synth(tool, event);
  }
  
+#ifdef HAVE_JITDUMP
+static int perf_event__drop_oe(struct perf_tool *tool __maybe_unused,
+                              union perf_event *event __maybe_unused,
+                              struct ordered_events *oe __maybe_unused)
+{
+       return 0;
+}
+#endif
+
  static int perf_event__repipe_op2_synth(struct perf_tool *tool,
                                         union perf_event *event,
                                         struct perf_session *session
@@ -234,6 +245,31 @@ static int perf_event__repipe_mmap(struct perf_tool *tool,
         return err;
  }
  
+#ifdef HAVE_JITDUMP
+static int perf_event__jit_repipe_mmap(struct perf_tool *tool,
+                                      union perf_event *event,
+                                      struct perf_sample *sample,
+                                      struct machine *machine)
+{
+       struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+       u64 n = 0;
+       int ret;
+
+       /*
+        * if jit marker, then inject jit mmaps and generate ELF images
+        */
+       ret = jit_process(inject->session, &inject->output, machine,
+                         event->mmap.filename, sample->pid, &n);
+       if (ret < 0)
+               return ret;
+       if (ret) {
+               inject->bytes_written += n;
+               return 0;
+       }
+       return perf_event__repipe_mmap(tool, event, sample, machine);
+}
+#endif
+
  static int perf_event__repipe_mmap2(struct perf_tool *tool,
                                    union perf_event *event,
                                    struct perf_sample *sample,
@@ -247,6 +283,31 @@ static int perf_event__repipe_mmap2(struct perf_tool *tool,
         return err;
  }
  
+#ifdef HAVE_JITDUMP
+static int perf_event__jit_repipe_mmap2(struct perf_tool *tool,
+                                       union perf_event *event,
+                                       struct perf_sample *sample,
+                                       struct machine *machine)
+{
+       struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+       u64 n = 0;
+       int ret;
+
+       /*
+        * if jit marker, then inject jit mmaps and generate ELF images
+        */
+       ret = jit_process(inject->session, &inject->output, machine,
+                         event->mmap2.filename, sample->pid, &n);
+       if (ret < 0)
+               return ret;
+       if (ret) {
+               inject->bytes_written += n;
+               return 0;
+       }
+       return perf_event__repipe_mmap2(tool, event, sample, machine);
+}
+#endif
+
  static int perf_event__repipe_fork(struct perf_tool *tool,
                                    union perf_event *event,
                                    struct perf_sample *sample,
@@ -626,12 +687,16 @@ static int __cmd_inject(struct perf_inject *inject)
         ret = perf_session__process_events(session);
  
         if (!file_out->is_pipe) {
-               if (inject->build_ids) {
+               if (inject->build_ids)
                         perf_header__set_feat(&session->header,
                                               HEADER_BUILD_ID);
-                       if (inject->have_auxtrace)
-                               dsos__hit_all(session);
-               }
+               /*
+                * Keep all buildids when there is unprocessed AUX data because
+                * it is not known which ones the AUX trace hits.
+                */
+               if (perf_header__has_feat(&session->header, HEADER_BUILD_ID) &&
+                   inject->have_auxtrace && !inject->itrace_synth_opts.set)
+                       dsos__hit_all(session);
                 /*
                  * The AUX areas have been removed and replaced with
                  * synthesized hardware events, so clear the feature flag and
@@ -703,7 +768,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
         };
         int ret;
  
-       const struct option options[] = {
+       struct option options[] = {
                 OPT_BOOLEAN('b', "build-ids", &inject.build_ids,
                             "Inject build-ids into the output stream"),
                 OPT_STRING('i', "input", &inject.input_name, "file",
@@ -713,6 +778,9 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
                 OPT_BOOLEAN('s', "sched-stat", &inject.sched_stat,
                             "Merge sched-stat and sched-switch for getting events "
                             "where and how long tasks slept"),
+#ifdef HAVE_JITDUMP
+               OPT_BOOLEAN('j', "jit", &inject.jit_mode, "merge jitdump files into perf.data file"),
+#endif
                 OPT_INCR('v', "verbose", &verbose,
                          "be more verbose (show build ids, etc)"),
                 OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, "file",
@@ -729,7 +797,9 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
                 "perf inject [<options>]",
                 NULL
         };
-
+#ifndef HAVE_JITDUMP
+       set_option_nobuild(options, 'j', "jit", "NO_LIBELF=1", true);
+#endif
         argc = parse_options(argc, argv, options, inject_usage, 0);
  
         /*
@@ -755,6 +825,29 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
         if (inject.session == NULL)
                 return -1;
  
+       if (inject.build_ids) {
+               /*
+                * to make sure the mmap records are ordered correctly
+                * and so that the correct especially due to jitted code
+                * mmaps. We cannot generate the buildid hit list and
+                * inject the jit mmaps at the same time for now.
+                */
+               inject.tool.ordered_events = true;
+               inject.tool.ordering_requires_timestamps = true;
+       }
+#ifdef HAVE_JITDUMP
+       if (inject.jit_mode) {
+               inject.tool.mmap2          = perf_event__jit_repipe_mmap2;
+               inject.tool.mmap           = perf_event__jit_repipe_mmap;
+               inject.tool.ordered_events = true;
+               inject.tool.ordering_requires_timestamps = true;
+               /*
+                * JIT MMAP injection injects all MMAP events in one go, so it
+                * does not obey finished_round semantics.
+                */
+               inject.tool.finished_round = perf_event__drop_oe;
+       }
+#endif
         ret = symbol__init(&inject.session->header.env);
         if (ret < 0)
                 goto out_delete;
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c

index 118010553d0cf0dca9de6f493a5c8e267b0fff48..4d3340cce9a0200b469fd61a563eee4cb7f48830 100644 (file)
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -1834,7 +1834,7 @@ static int __cmd_record(int argc, const char **argv)
         return cmd_record(i, rec_argv, NULL);
  }
  
-static int kmem_config(const char *var, const char *value, void *cb)
+static int kmem_config(const char *var, const char *value, void *cb __maybe_unused)
  {
         if (!strcmp(var, "kmem.default")) {
                 if (!strcmp(value, "slab"))
@@ -1847,7 +1847,7 @@ static int kmem_config(const char *var, const char *value, void *cb)
                 return 0;
         }
  
-       return perf_default_config(var, value, cb);
+       return 0;
  }
  
  int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c

index 4418d9214872150648a719dd07fc14a29c913e9b..bff666458b28e24dccac682d0f28b6708a1a7c83 100644 (file)
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -30,7 +30,6 @@
  #include <math.h>
  
  #ifdef HAVE_KVM_STAT_SUPPORT
-#include <asm/kvm_perf.h>
  #include "util/kvm-stat.h"
  
  void exit_event_get_key(struct perf_evsel *evsel,
@@ -38,12 +37,12 @@ void exit_event_get_key(struct perf_evsel *evsel,
                         struct event_key *key)
  {
         key->info = 0;
-       key->key = perf_evsel__intval(evsel, sample, KVM_EXIT_REASON);
+       key->key = perf_evsel__intval(evsel, sample, kvm_exit_reason);
  }
  
  bool kvm_exit_event(struct perf_evsel *evsel)
  {
-       return !strcmp(evsel->name, KVM_EXIT_TRACE);
+       return !strcmp(evsel->name, kvm_exit_trace);
  }
  
  bool exit_event_begin(struct perf_evsel *evsel,
@@ -59,7 +58,7 @@ bool exit_event_begin(struct perf_evsel *evsel,
  
  bool kvm_entry_event(struct perf_evsel *evsel)
  {
-       return !strcmp(evsel->name, KVM_ENTRY_TRACE);
+       return !strcmp(evsel->name, kvm_entry_trace);
  }
  
  bool exit_event_end(struct perf_evsel *evsel,
@@ -91,7 +90,7 @@ void exit_event_decode_key(struct perf_kvm_stat *kvm,
         const char *exit_reason = get_exit_reason(kvm, key->exit_reasons,
                                                   key->key);
  
-       scnprintf(decode, DECODE_STR_LEN, "%s", exit_reason);
+       scnprintf(decode, decode_str_len, "%s", exit_reason);
  }
  
  static bool register_kvm_events_ops(struct perf_kvm_stat *kvm)
@@ -357,7 +356,7 @@ static bool handle_end_event(struct perf_kvm_stat *kvm,
         time_diff = sample->time - time_begin;
  
         if (kvm->duration && time_diff > kvm->duration) {
-               char decode[DECODE_STR_LEN];
+               char decode[decode_str_len];
  
                 kvm->events_ops->decode_key(kvm, &event->key, decode);
                 if (!skip_event(decode)) {
@@ -385,7 +384,8 @@ struct vcpu_event_record *per_vcpu_record(struct thread *thread,
                         return NULL;
                 }
  
-               vcpu_record->vcpu_id = perf_evsel__intval(evsel, sample, VCPU_ID);
+               vcpu_record->vcpu_id = perf_evsel__intval(evsel, sample,
+                                                         vcpu_id_str);
                 thread__set_priv(thread, vcpu_record);
         }
  
@@ -574,7 +574,7 @@ static void show_timeofday(void)
  
  static void print_result(struct perf_kvm_stat *kvm)
  {
-       char decode[DECODE_STR_LEN];
+       char decode[decode_str_len];
         struct kvm_event *event;
         int vcpu = kvm->trace_vcpu;
  
@@ -585,7 +585,7 @@ static void print_result(struct perf_kvm_stat *kvm)
  
         pr_info("\n\n");
         print_vcpu_info(kvm);
-       pr_info("%*s ", DECODE_STR_LEN, kvm->events_ops->name);
+       pr_info("%*s ", decode_str_len, kvm->events_ops->name);
         pr_info("%10s ", "Samples");
         pr_info("%9s ", "Samples%");
  
@@ -604,7 +604,7 @@ static void print_result(struct perf_kvm_stat *kvm)
                 min = get_event_min(event, vcpu);
  
                 kvm->events_ops->decode_key(kvm, &event->key, decode);
-               pr_info("%*s ", DECODE_STR_LEN, decode);
+               pr_info("%*s ", decode_str_len, decode);
                 pr_info("%10llu ", (unsigned long long)ecount);
                 pr_info("%8.2f%% ", (double)ecount / kvm->total_count * 100);
                 pr_info("%8.2f%% ", (double)etime / kvm->total_time * 100);
@@ -1132,6 +1132,11 @@ exit:
                 _p;                     \
         })
  
+int __weak setup_kvm_events_tp(struct perf_kvm_stat *kvm __maybe_unused)
+{
+       return 0;
+}
+
  static int
  kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
  {
@@ -1148,7 +1153,14 @@ kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
                 NULL
         };
         const char * const *events_tp;
+       int ret;
+
         events_tp_size = 0;
+       ret = setup_kvm_events_tp(kvm);
+       if (ret < 0) {
+               pr_err("Unable to setup the kvm tracepoints\n");
+               return ret;
+       }
  
         for (events_tp = kvm_events_tp; *events_tp; events_tp++)
                 events_tp_size++;
@@ -1377,6 +1389,12 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
         /*
          * generate the event list
          */
+       err = setup_kvm_events_tp(kvm);
+       if (err < 0) {
+               pr_err("Unable to setup the kvm tracepoints\n");
+               return err;
+       }
+
         kvm->evlist = kvm_live_event_list();
         if (kvm->evlist == NULL) {
                 err = -1;
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c

index 39017004169665ec4a2ebd59aa0bc43e58412874..88aeac9aa1da12c664ec64ee419d6ce48c5ad8ec 100644 (file)
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -6,6 +6,8 @@
  #include "util/tool.h"
  #include "util/session.h"
  #include "util/data.h"
+#include "util/mem-events.h"
+#include "util/debug.h"
  
  #define MEM_OPERATION_LOAD     0x1
  #define MEM_OPERATION_STORE    0x2
@@ -21,11 +23,56 @@ struct perf_mem {
         DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
  };
  
+static int parse_record_events(const struct option *opt,
+                              const char *str, int unset __maybe_unused)
+{
+       struct perf_mem *mem = *(struct perf_mem **)opt->value;
+       int j;
+
+       if (strcmp(str, "list")) {
+               if (!perf_mem_events__parse(str)) {
+                       mem->operation = 0;
+                       return 0;
+               }
+               exit(-1);
+       }
+
+       for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
+               struct perf_mem_event *e = &perf_mem_events[j];
+
+               fprintf(stderr, "%-13s%-*s%s\n",
+                       e->tag,
+                       verbose ? 25 : 0,
+                       verbose ? perf_mem_events__name(j) : "",
+                       e->supported ? ": available" : "");
+       }
+       exit(0);
+}
+
+static const char * const __usage[] = {
+       "perf mem record [<options>] [<command>]",
+       "perf mem record [<options>] -- <command> [<options>]",
+       NULL
+};
+
+static const char * const *record_mem_usage = __usage;
+
  static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
  {
         int rec_argc, i = 0, j;
         const char **rec_argv;
         int ret;
+       struct option options[] = {
+       OPT_CALLBACK('e', "event", &mem, "event",
+                    "event selector. use 'perf mem record -e list' to list available events",
+                    parse_record_events),
+       OPT_INCR('v', "verbose", &verbose,
+                "be more verbose (show counter open errors, etc)"),
+       OPT_END()
+       };
+
+       argc = parse_options(argc, argv, options, record_mem_usage,
+                            PARSE_OPT_STOP_AT_NON_OPTION);
  
         rec_argc = argc + 7; /* max number of arguments */
         rec_argv = calloc(rec_argc + 1, sizeof(char *));
@@ -35,23 +82,40 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
         rec_argv[i++] = "record";
  
         if (mem->operation & MEM_OPERATION_LOAD)
+               perf_mem_events[PERF_MEM_EVENTS__LOAD].record = true;
+
+       if (perf_mem_events[PERF_MEM_EVENTS__LOAD].record)
                 rec_argv[i++] = "-W";
  
         rec_argv[i++] = "-d";
  
-       if (mem->operation & MEM_OPERATION_LOAD) {
-               rec_argv[i++] = "-e";
-               rec_argv[i++] = "cpu/mem-loads/pp";
-       }
+       for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
+               if (!perf_mem_events[j].record)
+                       continue;
+
+               if (!perf_mem_events[j].supported) {
+                       pr_err("failed: event '%s' not supported\n",
+                              perf_mem_events__name(j));
+                       return -1;
+               }
  
-       if (mem->operation & MEM_OPERATION_STORE) {
                 rec_argv[i++] = "-e";
-               rec_argv[i++] = "cpu/mem-stores/pp";
-       }
+               rec_argv[i++] = perf_mem_events__name(j);
+       };
  
-       for (j = 1; j < argc; j++, i++)
+       for (j = 0; j < argc; j++, i++)
                 rec_argv[i] = argv[j];
  
+       if (verbose > 0) {
+               pr_debug("calling: record ");
+
+               while (rec_argv[j]) {
+                       pr_debug("%s ", rec_argv[j]);
+                       j++;
+               }
+               pr_debug("\n");
+       }
+
         ret = cmd_record(i, rec_argv, NULL);
         free(rec_argv);
         return ret;
@@ -298,6 +362,10 @@ int cmd_mem(int argc, const char **argv, const char *prefix __maybe_unused)
                 NULL
         };
  
+       if (perf_mem_events__init()) {
+               pr_err("failed: memory events not supported\n");
+               return -1;
+       }
  
         argc = parse_options_subcommand(argc, argv, mem_options, mem_subcommands,
                                         mem_usage, PARSE_OPT_STOP_AT_NON_OPTION);
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c

index 319712a4e02b73de7359fc2ed772bed1d9cd424f..515510ecc76a43391e2ac58f830557b51810b466 100644 (file)
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -32,6 +32,8 @@
  #include "util/parse-branch-options.h"
  #include "util/parse-regs-options.h"
  #include "util/llvm-utils.h"
+#include "util/bpf-loader.h"
+#include "asm/bug.h"
  
  #include <unistd.h>
  #include <sched.h>
@@ -49,7 +51,9 @@ struct record {
         const char              *progname;
         int                     realtime_prio;
         bool                    no_buildid;
+       bool                    no_buildid_set;
         bool                    no_buildid_cache;
+       bool                    no_buildid_cache_set;
         bool                    buildid_all;
         unsigned long long      samples;
  };
@@ -320,7 +324,10 @@ try_again:
                 } else {
                         pr_err("failed to mmap with %d (%s)\n", errno,
                                 strerror_r(errno, msg, sizeof(msg)));
-                       rc = -errno;
+                       if (errno)
+                               rc = -errno;
+                       else
+                               rc = -EINVAL;
                 }
                 goto out;
         }
@@ -464,6 +471,29 @@ static void record__init_features(struct record *rec)
         perf_header__clear_feat(&session->header, HEADER_STAT);
  }
  
+static void
+record__finish_output(struct record *rec)
+{
+       struct perf_data_file *file = &rec->file;
+       int fd = perf_data_file__fd(file);
+
+       if (file->is_pipe)
+               return;
+
+       rec->session->header.data_size += rec->bytes_written;
+       file->size = lseek(perf_data_file__fd(file), 0, SEEK_CUR);
+
+       if (!rec->no_buildid) {
+               process_buildids(rec);
+
+               if (rec->buildid_all)
+                       dsos__hit_all(rec->session);
+       }
+       perf_session__write_header(rec->session, rec->evlist, fd, true);
+
+       return;
+}
+
  static volatile int workload_exec_errno;
  
  /*
@@ -482,6 +512,74 @@ static void workload_exec_failed_signal(int signo __maybe_unused,
  
  static void snapshot_sig_handler(int sig);
  
+static int record__synthesize(struct record *rec)
+{
+       struct perf_session *session = rec->session;
+       struct machine *machine = &session->machines.host;
+       struct perf_data_file *file = &rec->file;
+       struct record_opts *opts = &rec->opts;
+       struct perf_tool *tool = &rec->tool;
+       int fd = perf_data_file__fd(file);
+       int err = 0;
+
+       if (file->is_pipe) {
+               err = perf_event__synthesize_attrs(tool, session,
+                                                  process_synthesized_event);
+               if (err < 0) {
+                       pr_err("Couldn't synthesize attrs.\n");
+                       goto out;
+               }
+
+               if (have_tracepoints(&rec->evlist->entries)) {
+                       /*
+                        * FIXME err <= 0 here actually means that
+                        * there were no tracepoints so its not really
+                        * an error, just that we don't need to
+                        * synthesize anything.  We really have to
+                        * return this more properly and also
+                        * propagate errors that now are calling die()
+                        */
+                       err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
+                                                                 process_synthesized_event);
+                       if (err <= 0) {
+                               pr_err("Couldn't record tracing data.\n");
+                               goto out;
+                       }
+                       rec->bytes_written += err;
+               }
+       }
+
+       if (rec->opts.full_auxtrace) {
+               err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
+                                       session, process_synthesized_event);
+               if (err)
+                       goto out;
+       }
+
+       err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
+                                                machine);
+       WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
+                          "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
+                          "Check /proc/kallsyms permission or run as root.\n");
+
+       err = perf_event__synthesize_modules(tool, process_synthesized_event,
+                                            machine);
+       WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
+                          "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
+                          "Check /proc/modules permission or run as root.\n");
+
+       if (perf_guest) {
+               machines__process_guests(&session->machines,
+                                        perf_event__synthesize_guest_os, tool);
+       }
+
+       err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
+                                           process_synthesized_event, opts->sample_address,
+                                           opts->proc_map_timeout);
+out:
+       return err;
+}
+
  static int __cmd_record(struct record *rec, int argc, const char **argv)
  {
         int err;
@@ -534,6 +632,16 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
                 goto out_child;
         }
  
+       err = bpf__apply_obj_config();
+       if (err) {
+               char errbuf[BUFSIZ];
+
+               bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
+               pr_err("ERROR: Apply config to BPF failed: %s\n",
+                        errbuf);
+               goto out_child;
+       }
+
         /*
          * Normally perf_session__new would do this, but it doesn't have the
          * evlist.
@@ -566,63 +674,8 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
  
         machine = &session->machines.host;
  
-       if (file->is_pipe) {
-               err = perf_event__synthesize_attrs(tool, session,
-                                                  process_synthesized_event);
-               if (err < 0) {
-                       pr_err("Couldn't synthesize attrs.\n");
-                       goto out_child;
-               }
-
-               if (have_tracepoints(&rec->evlist->entries)) {
-                       /*
-                        * FIXME err <= 0 here actually means that
-                        * there were no tracepoints so its not really
-                        * an error, just that we don't need to
-                        * synthesize anything.  We really have to
-                        * return this more properly and also
-                        * propagate errors that now are calling die()
-                        */
-                       err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
-                                                                 process_synthesized_event);
-                       if (err <= 0) {
-                               pr_err("Couldn't record tracing data.\n");
-                               goto out_child;
-                       }
-                       rec->bytes_written += err;
-               }
-       }
-
-       if (rec->opts.full_auxtrace) {
-               err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
-                                       session, process_synthesized_event);
-               if (err)
-                       goto out_delete_session;
-       }
-
-       err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
-                                                machine);
-       if (err < 0)
-               pr_err("Couldn't record kernel reference relocation symbol\n"
-                      "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
-                      "Check /proc/kallsyms permission or run as root.\n");
-
-       err = perf_event__synthesize_modules(tool, process_synthesized_event,
-                                            machine);
+       err = record__synthesize(rec);
         if (err < 0)
-               pr_err("Couldn't record kernel module information.\n"
-                      "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
-                      "Check /proc/modules permission or run as root.\n");
-
-       if (perf_guest) {
-               machines__process_guests(&session->machines,
-                                        perf_event__synthesize_guest_os, tool);
-       }
-
-       err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
-                                           process_synthesized_event, opts->sample_address,
-                                           opts->proc_map_timeout);
-       if (err != 0)
                 goto out_child;
  
         if (rec->realtime_prio) {
@@ -758,18 +811,8 @@ out_child:
         /* this will be recalculated during process_buildids() */
         rec->samples = 0;
  
-       if (!err && !file->is_pipe) {
-               rec->session->header.data_size += rec->bytes_written;
-               file->size = lseek(perf_data_file__fd(file), 0, SEEK_CUR);
-
-               if (!rec->no_buildid) {
-                       process_buildids(rec);
-
-                       if (rec->buildid_all)
-                               dsos__hit_all(rec->session);
-               }
-               perf_session__write_header(rec->session, rec->evlist, fd, true);
-       }
+       if (!err)
+               record__finish_output(rec);
  
         if (!err && !quiet) {
                 char samples[128];
@@ -1097,10 +1140,12 @@ struct option __record_options[] = {
         OPT_BOOLEAN('P', "period", &record.opts.period, "Record the sample period"),
         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
                     "don't sample"),
-       OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
-                   "do not update the buildid cache"),
-       OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
-                   "do not collect buildids in perf.data"),
+       OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
+                       &record.no_buildid_cache_set,
+                       "do not update the buildid cache"),
+       OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
+                       &record.no_buildid_set,
+                       "do not collect buildids in perf.data"),
         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
                      "monitor event in cgroup name only",
                      parse_cgroups),
@@ -1136,6 +1181,12 @@ struct option __record_options[] = {
                         "per thread proc mmap processing timeout in ms"),
         OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
                     "Record context switch events"),
+       OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
+                        "Configure all used events to run in kernel space.",
+                        PARSE_OPT_EXCLUSIVE),
+       OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
+                        "Configure all used events to run in user space.",
+                        PARSE_OPT_EXCLUSIVE),
         OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
                    "clang binary to use for compiling BPF scriptlets"),
         OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c

index 2bf537f190a026952bd63126e9c37def827e9914..7eea49f9ed46b520bdce56972379b76c14302c4b 100644 (file)
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -75,7 +75,10 @@ static int report__config(const char *var, const char *value, void *cb)
                 return 0;
         }
         if (!strcmp(var, "report.percent-limit")) {
-               rep->min_percent = strtof(value, NULL);
+               double pcnt = strtof(value, NULL);
+
+               rep->min_percent = pcnt;
+               callchain_param.min_percent = pcnt;
                 return 0;
         }
         if (!strcmp(var, "report.children")) {
@@ -87,7 +90,7 @@ static int report__config(const char *var, const char *value, void *cb)
                 return 0;
         }
  
-       return perf_default_config(var, value, cb);
+       return 0;
  }
  
  static int hist_iter__report_callback(struct hist_entry_iter *iter,
@@ -466,10 +469,11 @@ static int report__browse_hists(struct report *rep)
         return ret;
  }
  
-static void report__collapse_hists(struct report *rep)
+static int report__collapse_hists(struct report *rep)
  {
         struct ui_progress prog;
         struct perf_evsel *pos;
+       int ret = 0;
  
         ui_progress__init(&prog, rep->nr_entries, "Merging related events...");
  
@@ -481,7 +485,9 @@ static void report__collapse_hists(struct report *rep)
  
                 hists->socket_filter = rep->socket_filter;
  
-               hists__collapse_resort(hists, &prog);
+               ret = hists__collapse_resort(hists, &prog);
+               if (ret < 0)
+                       break;
  
                 /* Non-group events are considered as leader */
                 if (symbol_conf.event_group &&
@@ -494,6 +500,7 @@ static void report__collapse_hists(struct report *rep)
         }
  
         ui_progress__finish();
+       return ret;
  }
  
  static void report__output_resort(struct report *rep)
@@ -504,7 +511,7 @@ static void report__output_resort(struct report *rep)
         ui_progress__init(&prog, rep->nr_entries, "Sorting events for output...");
  
         evlist__for_each(rep->session->evlist, pos)
-               hists__output_resort(evsel__hists(pos), &prog);
+               perf_evsel__output_resort(pos, &prog);
  
         ui_progress__finish();
  }
@@ -561,7 +568,11 @@ static int __cmd_report(struct report *rep)
                 }
         }
  
-       report__collapse_hists(rep);
+       ret = report__collapse_hists(rep);
+       if (ret) {
+               ui__error("failed to process hist entry\n");
+               return ret;
+       }
  
         if (session_done())
                 return 0;
@@ -633,8 +644,10 @@ parse_percent_limit(const struct option *opt, const char *str,
                     int unset __maybe_unused)
  {
         struct report *rep = opt->value;
+       double pcnt = strtof(str, NULL);
  
-       rep->min_percent = strtof(str, NULL);
+       rep->min_percent = pcnt;
+       callchain_param.min_percent = pcnt;
         return 0;
  }
  
@@ -798,6 +811,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                     "only show processor socket that match with this filter"),
         OPT_BOOLEAN(0, "raw-trace", &symbol_conf.raw_trace,
                     "Show raw trace event output (do not use print fmt or plugins)"),
+       OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy,
+                   "Show entries in a hierarchy"),
         OPT_END()
         };
         struct perf_data_file file = {
@@ -907,13 +922,19 @@ repeat:
                 symbol_conf.cumulate_callchain = false;
         }
  
-       if (setup_sorting(session->evlist) < 0) {
-               if (sort_order)
-                       parse_options_usage(report_usage, options, "s", 1);
-               if (field_order)
-                       parse_options_usage(sort_order ? NULL : report_usage,
-                                           options, "F", 1);
-               goto error;
+       if (symbol_conf.report_hierarchy) {
+               /* disable incompatible options */
+               symbol_conf.event_group = false;
+               symbol_conf.cumulate_callchain = false;
+
+               if (field_order) {
+                       pr_err("Error: --hierarchy and --fields options cannot be used together\n");
+                       parse_options_usage(report_usage, options, "F", 1);
+                       parse_options_usage(NULL, options, "hierarchy", 0);
+                       goto error;
+               }
+
+               sort__need_collapse = true;
         }
  
         /* Force tty output for header output and per-thread stat. */
@@ -925,6 +946,15 @@ repeat:
         else
                 use_browser = 0;
  
+       if (setup_sorting(session->evlist) < 0) {
+               if (sort_order)
+                       parse_options_usage(report_usage, options, "s", 1);
+               if (field_order)
+                       parse_options_usage(sort_order ? NULL : report_usage,
+                                           options, "F", 1);
+               goto error;
+       }
+
         if (report.header || report.header_only) {
                 perf_session__fprintf_info(session, stdout,
                                            report.show_full_info);
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c

index c691214d820f0050c39e9d376d69cd5891132a47..57f9a7e7f7d3e948a29a92018140902c7f5c840f 100644 (file)
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -23,6 +23,7 @@
  #include "util/stat.h"
  #include <linux/bitmap.h>
  #include "asm/bug.h"
+#include "util/mem-events.h"
  
  static char const              *script_name;
  static char const              *generate_script_lang;
@@ -58,6 +59,9 @@ enum perf_output_field {
         PERF_OUTPUT_IREGS           = 1U << 14,
         PERF_OUTPUT_BRSTACK         = 1U << 15,
         PERF_OUTPUT_BRSTACKSYM      = 1U << 16,
+       PERF_OUTPUT_DATA_SRC        = 1U << 17,
+       PERF_OUTPUT_WEIGHT          = 1U << 18,
+       PERF_OUTPUT_BPF_OUTPUT      = 1U << 19,
  };
  
  struct output_option {
@@ -81,6 +85,9 @@ struct output_option {
         {.str = "iregs", .field = PERF_OUTPUT_IREGS},
         {.str = "brstack", .field = PERF_OUTPUT_BRSTACK},
         {.str = "brstacksym", .field = PERF_OUTPUT_BRSTACKSYM},
+       {.str = "data_src", .field = PERF_OUTPUT_DATA_SRC},
+       {.str = "weight",   .field = PERF_OUTPUT_WEIGHT},
+       {.str = "bpf-output",   .field = PERF_OUTPUT_BPF_OUTPUT},
  };
  
  /* default set to maintain compatibility with current format */
@@ -101,7 +108,7 @@ static struct {
                               PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
                               PERF_OUTPUT_PERIOD,
  
-               .invalid_fields = PERF_OUTPUT_TRACE,
+               .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
         },
  
         [PERF_TYPE_SOFTWARE] = {
@@ -111,7 +118,7 @@ static struct {
                               PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
                               PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
                               PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
-                             PERF_OUTPUT_PERIOD,
+                             PERF_OUTPUT_PERIOD | PERF_OUTPUT_BPF_OUTPUT,
  
                 .invalid_fields = PERF_OUTPUT_TRACE,
         },
@@ -121,7 +128,7 @@ static struct {
  
                 .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
                                   PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
-                                 PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE,
+                                 PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE
         },
  
         [PERF_TYPE_RAW] = {
@@ -131,9 +138,10 @@ static struct {
                               PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
                               PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
                               PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
-                             PERF_OUTPUT_PERIOD,
+                             PERF_OUTPUT_PERIOD |  PERF_OUTPUT_ADDR |
+                             PERF_OUTPUT_DATA_SRC | PERF_OUTPUT_WEIGHT,
  
-               .invalid_fields = PERF_OUTPUT_TRACE,
+               .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
         },
  
         [PERF_TYPE_BREAKPOINT] = {
@@ -145,7 +153,7 @@ static struct {
                               PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
                               PERF_OUTPUT_PERIOD,
  
-               .invalid_fields = PERF_OUTPUT_TRACE,
+               .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
         },
  };
  
@@ -242,6 +250,16 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
                                            PERF_OUTPUT_ADDR, allow_user_set))
                 return -EINVAL;
  
+       if (PRINT_FIELD(DATA_SRC) &&
+               perf_evsel__check_stype(evsel, PERF_SAMPLE_DATA_SRC, "DATA_SRC",
+                                       PERF_OUTPUT_DATA_SRC))
+               return -EINVAL;
+
+       if (PRINT_FIELD(WEIGHT) &&
+               perf_evsel__check_stype(evsel, PERF_SAMPLE_WEIGHT, "WEIGHT",
+                                       PERF_OUTPUT_WEIGHT))
+               return -EINVAL;
+
         if (PRINT_FIELD(SYM) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR)) {
                 pr_err("Display of symbols requested but neither sample IP nor "
                            "sample address\nis selected. Hence, no addresses to convert "
@@ -608,6 +626,84 @@ static void print_sample_flags(u32 flags)
         printf("  %-4s ", str);
  }
  
+struct printer_data {
+       int line_no;
+       bool hit_nul;
+       bool is_printable;
+};
+
+static void
+print_sample_bpf_output_printer(enum binary_printer_ops op,
+                               unsigned int val,
+                               void *extra)
+{
+       unsigned char ch = (unsigned char)val;
+       struct printer_data *printer_data = extra;
+
+       switch (op) {
+       case BINARY_PRINT_DATA_BEGIN:
+               printf("\n");
+               break;
+       case BINARY_PRINT_LINE_BEGIN:
+               printf("%17s", !printer_data->line_no ? "BPF output:" :
+                                                       "           ");
+               break;
+       case BINARY_PRINT_ADDR:
+               printf(" %04x:", val);
+               break;
+       case BINARY_PRINT_NUM_DATA:
+               printf(" %02x", val);
+               break;
+       case BINARY_PRINT_NUM_PAD:
+               printf("   ");
+               break;
+       case BINARY_PRINT_SEP:
+               printf("  ");
+               break;
+       case BINARY_PRINT_CHAR_DATA:
+               if (printer_data->hit_nul && ch)
+                       printer_data->is_printable = false;
+
+               if (!isprint(ch)) {
+                       printf("%c", '.');
+
+                       if (!printer_data->is_printable)
+                               break;
+
+                       if (ch == '\0')
+                               printer_data->hit_nul = true;
+                       else
+                               printer_data->is_printable = false;
+               } else {
+                       printf("%c", ch);
+               }
+               break;
+       case BINARY_PRINT_CHAR_PAD:
+               printf(" ");
+               break;
+       case BINARY_PRINT_LINE_END:
+               printf("\n");
+               printer_data->line_no++;
+               break;
+       case BINARY_PRINT_DATA_END:
+       default:
+               break;
+       }
+}
+
+static void print_sample_bpf_output(struct perf_sample *sample)
+{
+       unsigned int nr_bytes = sample->raw_size;
+       struct printer_data printer_data = {0, false, true};
+
+       print_binary(sample->raw_data, nr_bytes, 8,
+                    print_sample_bpf_output_printer, &printer_data);
+
+       if (printer_data.is_printable && printer_data.hit_nul)
+               printf("%17s \"%s\"\n", "BPF string:",
+                      (char *)(sample->raw_data));
+}
+
  struct perf_script {
         struct perf_tool        tool;
         struct perf_session     *session;
@@ -634,6 +730,23 @@ static int perf_evlist__max_name_len(struct perf_evlist *evlist)
         return max;
  }
  
+static size_t data_src__printf(u64 data_src)
+{
+       struct mem_info mi = { .data_src.val = data_src };
+       char decode[100];
+       char out[100];
+       static int maxlen;
+       int len;
+
+       perf_script__meminfo_scnprintf(decode, 100, &mi);
+
+       len = scnprintf(out, 100, "%16" PRIx64 " %s", data_src, decode);
+       if (maxlen < len)
+               maxlen = len;
+
+       return printf("%-*s", maxlen, out);
+}
+
  static void process_event(struct perf_script *script, union perf_event *event,
                           struct perf_sample *sample, struct perf_evsel *evsel,
                           struct addr_location *al)
@@ -673,6 +786,12 @@ static void process_event(struct perf_script *script, union perf_event *event,
         if (PRINT_FIELD(ADDR))
                 print_sample_addr(event, sample, thread, attr);
  
+       if (PRINT_FIELD(DATA_SRC))
+               data_src__printf(sample->data_src);
+
+       if (PRINT_FIELD(WEIGHT))
+               printf("%16" PRIu64, sample->weight);
+
         if (PRINT_FIELD(IP)) {
                 if (!symbol_conf.use_callchain)
                         printf(" ");
@@ -692,6 +811,9 @@ static void process_event(struct perf_script *script, union perf_event *event,
         else if (PRINT_FIELD(BRSTACKSYM))
                 print_sample_brstacksym(event, sample, thread, attr);
  
+       if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
+               print_sample_bpf_output(sample);
+
         printf("\n");
  }
  
@@ -1090,23 +1212,6 @@ static struct script_spec *script_spec__find(const char *spec)
         return NULL;
  }
  
-static struct script_spec *script_spec__findnew(const char *spec,
-                                               struct scripting_ops *ops)
-{
-       struct script_spec *s = script_spec__find(spec);
-
-       if (s)
-               return s;
-
-       s = script_spec__new(spec, ops);
-       if (!s)
-               return NULL;
-
-       script_spec__add(s);
-
-       return s;
-}
-
  int script_spec_register(const char *spec, struct scripting_ops *ops)
  {
         struct script_spec *s;
@@ -1115,9 +1220,11 @@ int script_spec_register(const char *spec, struct scripting_ops *ops)
         if (s)
                 return -1;
  
-       s = script_spec__findnew(spec, ops);
+       s = script_spec__new(spec, ops);
         if (!s)
                 return -1;
+       else
+               script_spec__add(s);
  
         return 0;
  }
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c

index 038e877081b682dd8d9ba052a0be66c8ac2786c6..1f19f2f999c841b9da140e10bcaf5e6e0f41ee6b 100644 (file)
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -122,6 +122,7 @@ static bool                 sync_run                        = false;
  static unsigned int            initial_delay                   = 0;
  static unsigned int            unit_width                      = 4; /* strlen("unit") */
  static bool                    forever                         = false;
+static bool                    metric_only                     = false;
  static struct timespec         ref_time;
  static struct cpu_map          *aggr_map;
  static aggr_get_id_t           aggr_get_id;
@@ -735,6 +736,191 @@ static void aggr_printout(struct perf_evsel *evsel, int id, int nr)
         }
  }
  
+struct outstate {
+       FILE *fh;
+       bool newline;
+       const char *prefix;
+       int  nfields;
+       int  id, nr;
+       struct perf_evsel *evsel;
+};
+
+#define METRIC_LEN  35
+
+static void new_line_std(void *ctx)
+{
+       struct outstate *os = ctx;
+
+       os->newline = true;
+}
+
+static void do_new_line_std(struct outstate *os)
+{
+       fputc('\n', os->fh);
+       fputs(os->prefix, os->fh);
+       aggr_printout(os->evsel, os->id, os->nr);
+       if (stat_config.aggr_mode == AGGR_NONE)
+               fprintf(os->fh, "        ");
+       fprintf(os->fh, "                                                 ");
+}
+
+static void print_metric_std(void *ctx, const char *color, const char *fmt,
+                            const char *unit, double val)
+{
+       struct outstate *os = ctx;
+       FILE *out = os->fh;
+       int n;
+       bool newline = os->newline;
+
+       os->newline = false;
+
+       if (unit == NULL || fmt == NULL) {
+               fprintf(out, "%-*s", METRIC_LEN, "");
+               return;
+       }
+
+       if (newline)
+               do_new_line_std(os);
+
+       n = fprintf(out, " # ");
+       if (color)
+               n += color_fprintf(out, color, fmt, val);
+       else
+               n += fprintf(out, fmt, val);
+       fprintf(out, " %-*s", METRIC_LEN - n - 1, unit);
+}
+
+static void new_line_csv(void *ctx)
+{
+       struct outstate *os = ctx;
+       int i;
+
+       fputc('\n', os->fh);
+       if (os->prefix)
+               fprintf(os->fh, "%s%s", os->prefix, csv_sep);
+       aggr_printout(os->evsel, os->id, os->nr);
+       for (i = 0; i < os->nfields; i++)
+               fputs(csv_sep, os->fh);
+}
+
+static void print_metric_csv(void *ctx,
+                            const char *color __maybe_unused,
+                            const char *fmt, const char *unit, double val)
+{
+       struct outstate *os = ctx;
+       FILE *out = os->fh;
+       char buf[64], *vals, *ends;
+
+       if (unit == NULL || fmt == NULL) {
+               fprintf(out, "%s%s%s%s", csv_sep, csv_sep, csv_sep, csv_sep);
+               return;
+       }
+       snprintf(buf, sizeof(buf), fmt, val);
+       vals = buf;
+       while (isspace(*vals))
+               vals++;
+       ends = vals;
+       while (isdigit(*ends) || *ends == '.')
+               ends++;
+       *ends = 0;
+       while (isspace(*unit))
+               unit++;
+       fprintf(out, "%s%s%s%s", csv_sep, vals, csv_sep, unit);
+}
+
+#define METRIC_ONLY_LEN 20
+
+/* Filter out some columns that don't work well in metrics only mode */
+
+static bool valid_only_metric(const char *unit)
+{
+       if (!unit)
+               return false;
+       if (strstr(unit, "/sec") ||
+           strstr(unit, "hz") ||
+           strstr(unit, "Hz") ||
+           strstr(unit, "CPUs utilized"))
+               return false;
+       return true;
+}
+
+static const char *fixunit(char *buf, struct perf_evsel *evsel,
+                          const char *unit)
+{
+       if (!strncmp(unit, "of all", 6)) {
+               snprintf(buf, 1024, "%s %s", perf_evsel__name(evsel),
+                        unit);
+               return buf;
+       }
+       return unit;
+}
+
+static void print_metric_only(void *ctx, const char *color, const char *fmt,
+                             const char *unit, double val)
+{
+       struct outstate *os = ctx;
+       FILE *out = os->fh;
+       int n;
+       char buf[1024];
+       unsigned mlen = METRIC_ONLY_LEN;
+
+       if (!valid_only_metric(unit))
+               return;
+       unit = fixunit(buf, os->evsel, unit);
+       if (color)
+               n = color_fprintf(out, color, fmt, val);
+       else
+               n = fprintf(out, fmt, val);
+       if (n > METRIC_ONLY_LEN)
+               n = METRIC_ONLY_LEN;
+       if (mlen < strlen(unit))
+               mlen = strlen(unit) + 1;
+       fprintf(out, "%*s", mlen - n, "");
+}
+
+static void print_metric_only_csv(void *ctx, const char *color __maybe_unused,
+                                 const char *fmt,
+                                 const char *unit, double val)
+{
+       struct outstate *os = ctx;
+       FILE *out = os->fh;
+       char buf[64], *vals, *ends;
+       char tbuf[1024];
+
+       if (!valid_only_metric(unit))
+               return;
+       unit = fixunit(tbuf, os->evsel, unit);
+       snprintf(buf, sizeof buf, fmt, val);
+       vals = buf;
+       while (isspace(*vals))
+               vals++;
+       ends = vals;
+       while (isdigit(*ends) || *ends == '.')
+               ends++;
+       *ends = 0;
+       fprintf(out, "%s%s", vals, csv_sep);
+}
+
+static void new_line_metric(void *ctx __maybe_unused)
+{
+}
+
+static void print_metric_header(void *ctx, const char *color __maybe_unused,
+                               const char *fmt __maybe_unused,
+                               const char *unit, double val __maybe_unused)
+{
+       struct outstate *os = ctx;
+       char tbuf[1024];
+
+       if (!valid_only_metric(unit))
+               return;
+       unit = fixunit(tbuf, os->evsel, unit);
+       if (csv_output)
+               fprintf(os->fh, "%s%s", unit, csv_sep);
+       else
+               fprintf(os->fh, "%-*s ", METRIC_ONLY_LEN, unit);
+}
+
  static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg)
  {
         FILE *output = stat_config.output;
@@ -763,6 +949,28 @@ static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg)
                 fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
  }
  
+static int first_shadow_cpu(struct perf_evsel *evsel, int id)
+{
+       int i;
+
+       if (!aggr_get_id)
+               return 0;
+
+       if (stat_config.aggr_mode == AGGR_NONE)
+               return id;
+
+       if (stat_config.aggr_mode == AGGR_GLOBAL)
+               return 0;
+
+       for (i = 0; i < perf_evsel__nr_cpus(evsel); i++) {
+               int cpu2 = perf_evsel__cpus(evsel)->map[i];
+
+               if (aggr_get_id(evsel_list->cpus, cpu2) == id)
+                       return cpu2;
+       }
+       return 0;
+}
+
  static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
  {
         FILE *output = stat_config.output;
@@ -793,22 +1001,124 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
                 fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
  }
  
-static void printout(int id, int nr, struct perf_evsel *counter, double uval)
+static void printout(int id, int nr, struct perf_evsel *counter, double uval,
+                    char *prefix, u64 run, u64 ena, double noise)
  {
-       int cpu = cpu_map__id_to_cpu(id);
+       struct perf_stat_output_ctx out;
+       struct outstate os = {
+               .fh = stat_config.output,
+               .prefix = prefix ? prefix : "",
+               .id = id,
+               .nr = nr,
+               .evsel = counter,
+       };
+       print_metric_t pm = print_metric_std;
+       void (*nl)(void *);
  
-       if (stat_config.aggr_mode == AGGR_GLOBAL)
-               cpu = 0;
+       if (metric_only) {
+               nl = new_line_metric;
+               if (csv_output)
+                       pm = print_metric_only_csv;
+               else
+                       pm = print_metric_only;
+       } else
+               nl = new_line_std;
+
+       if (csv_output && !metric_only) {
+               static int aggr_fields[] = {
+                       [AGGR_GLOBAL] = 0,
+                       [AGGR_THREAD] = 1,
+                       [AGGR_NONE] = 1,
+                       [AGGR_SOCKET] = 2,
+                       [AGGR_CORE] = 2,
+               };
+
+               pm = print_metric_csv;
+               nl = new_line_csv;
+               os.nfields = 3;
+               os.nfields += aggr_fields[stat_config.aggr_mode];
+               if (counter->cgrp)
+                       os.nfields++;
+       }
+       if (run == 0 || ena == 0 || counter->counts->scaled == -1) {
+               if (metric_only) {
+                       pm(&os, NULL, "", "", 0);
+                       return;
+               }
+               aggr_printout(counter, id, nr);
+
+               fprintf(stat_config.output, "%*s%s",
+                       csv_output ? 0 : 18,
+                       counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
+                       csv_sep);
+
+               fprintf(stat_config.output, "%-*s%s",
+                       csv_output ? 0 : unit_width,
+                       counter->unit, csv_sep);
+
+               fprintf(stat_config.output, "%*s",
+                       csv_output ? 0 : -25,
+                       perf_evsel__name(counter));
+
+               if (counter->cgrp)
+                       fprintf(stat_config.output, "%s%s",
+                               csv_sep, counter->cgrp->name);
  
-       if (nsec_counter(counter))
+               if (!csv_output)
+                       pm(&os, NULL, NULL, "", 0);
+               print_noise(counter, noise);
+               print_running(run, ena);
+               if (csv_output)
+                       pm(&os, NULL, NULL, "", 0);
+               return;
+       }
+
+       if (metric_only)
+               /* nothing */;
+       else if (nsec_counter(counter))
                 nsec_printout(id, nr, counter, uval);
         else
                 abs_printout(id, nr, counter, uval);
  
-       if (!csv_output && !stat_config.interval)
-               perf_stat__print_shadow_stats(stat_config.output, counter,
-                                             uval, cpu,
-                                             stat_config.aggr_mode);
+       out.print_metric = pm;
+       out.new_line = nl;
+       out.ctx = &os;
+
+       if (csv_output && !metric_only) {
+               print_noise(counter, noise);
+               print_running(run, ena);
+       }
+
+       perf_stat__print_shadow_stats(counter, uval,
+                               first_shadow_cpu(counter, id),
+                               &out);
+       if (!csv_output && !metric_only) {
+               print_noise(counter, noise);
+               print_running(run, ena);
+       }
+}
+
+static void aggr_update_shadow(void)
+{
+       int cpu, s2, id, s;
+       u64 val;
+       struct perf_evsel *counter;
+
+       for (s = 0; s < aggr_map->nr; s++) {
+               id = aggr_map->map[s];
+               evlist__for_each(evsel_list, counter) {
+                       val = 0;
+                       for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
+                               s2 = aggr_get_id(evsel_list->cpus, cpu);
+                               if (s2 != id)
+                                       continue;
+                               val += perf_counts(counter->counts, cpu, 0)->val;
+                       }
+                       val = val * counter->scale;
+                       perf_stat__update_shadow_stats(counter, &val,
+                                                      first_shadow_cpu(counter, id));
+               }
+       }
  }
  
  static void print_aggr(char *prefix)
@@ -818,12 +1128,23 @@ static void print_aggr(char *prefix)
         int cpu, s, s2, id, nr;
         double uval;
         u64 ena, run, val;
+       bool first;
  
         if (!(aggr_map || aggr_get_id))
                 return;
  
+       aggr_update_shadow();
+
+       /*
+        * With metric_only everything is on a single line.
+        * Without each counter has its own line.
+        */
         for (s = 0; s < aggr_map->nr; s++) {
+               if (prefix && metric_only)
+                       fprintf(output, "%s", prefix);
+
                 id = aggr_map->map[s];
+               first = true;
                 evlist__for_each(evsel_list, counter) {
                         val = ena = run = 0;
                         nr = 0;
@@ -836,41 +1157,20 @@ static void print_aggr(char *prefix)
                                 run += perf_counts(counter->counts, cpu, 0)->run;
                                 nr++;
                         }
-                       if (prefix)
-                               fprintf(output, "%s", prefix);
-
-                       if (run == 0 || ena == 0) {
+                       if (first && metric_only) {
+                               first = false;
                                 aggr_printout(counter, id, nr);
-
-                               fprintf(output, "%*s%s",
-                                       csv_output ? 0 : 18,
-                                       counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
-                                       csv_sep);
-
-                               fprintf(output, "%-*s%s",
-                                       csv_output ? 0 : unit_width,
-                                       counter->unit, csv_sep);
-
-                               fprintf(output, "%*s",
-                                       csv_output ? 0 : -25,
-                                       perf_evsel__name(counter));
-
-                               if (counter->cgrp)
-                                       fprintf(output, "%s%s",
-                                               csv_sep, counter->cgrp->name);
-
-                               print_running(run, ena);
-                               fputc('\n', output);
-                               continue;
                         }
-                       uval = val * counter->scale;
-                       printout(id, nr, counter, uval);
-                       if (!csv_output)
-                               print_noise(counter, 1.0);
+                       if (prefix && !metric_only)
+                               fprintf(output, "%s", prefix);
  
-                       print_running(run, ena);
-                       fputc('\n', output);
+                       uval = val * counter->scale;
+                       printout(id, nr, counter, uval, prefix, run, ena, 1.0);
+                       if (!metric_only)
+                               fputc('\n', output);
                 }
+               if (metric_only)
+                       fputc('\n', output);
         }
  }
  
@@ -895,12 +1195,7 @@ static void print_aggr_thread(struct perf_evsel *counter, char *prefix)
                         fprintf(output, "%s", prefix);
  
                 uval = val * counter->scale;
-               printout(thread, 0, counter, uval);
-
-               if (!csv_output)
-                       print_noise(counter, 1.0);
-
-               print_running(run, ena);
+               printout(thread, 0, counter, uval, prefix, run, ena, 1.0);
                 fputc('\n', output);
         }
  }
@@ -914,43 +1209,19 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix)
         FILE *output = stat_config.output;
         struct perf_stat_evsel *ps = counter->priv;
         double avg = avg_stats(&ps->res_stats[0]);
-       int scaled = counter->counts->scaled;
         double uval;
         double avg_enabled, avg_running;
  
         avg_enabled = avg_stats(&ps->res_stats[1]);
         avg_running = avg_stats(&ps->res_stats[2]);
  
-       if (prefix)
+       if (prefix && !metric_only)
                 fprintf(output, "%s", prefix);
  
-       if (scaled == -1 || !counter->supported) {
-               fprintf(output, "%*s%s",
-                       csv_output ? 0 : 18,
-                       counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
-                       csv_sep);
-               fprintf(output, "%-*s%s",
-                       csv_output ? 0 : unit_width,
-                       counter->unit, csv_sep);
-               fprintf(output, "%*s",
-                       csv_output ? 0 : -25,
-                       perf_evsel__name(counter));
-
-               if (counter->cgrp)
-                       fprintf(output, "%s%s", csv_sep, counter->cgrp->name);
-
-               print_running(avg_running, avg_enabled);
-               fputc('\n', output);
-               return;
-       }
-
         uval = avg * counter->scale;
-       printout(-1, 0, counter, uval);
-
-       print_noise(counter, avg);
-
-       print_running(avg_running, avg_enabled);
-       fprintf(output, "\n");
+       printout(-1, 0, counter, uval, prefix, avg_running, avg_enabled, avg);
+       if (!metric_only)
+               fprintf(output, "\n");
  }
  
  /*
@@ -972,39 +1243,78 @@ static void print_counter(struct perf_evsel *counter, char *prefix)
                 if (prefix)
                         fprintf(output, "%s", prefix);
  
-               if (run == 0 || ena == 0) {
-                       fprintf(output, "CPU%*d%s%*s%s",
-                               csv_output ? 0 : -4,
-                               perf_evsel__cpus(counter)->map[cpu], csv_sep,
-                               csv_output ? 0 : 18,
-                               counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
-                               csv_sep);
+               uval = val * counter->scale;
+               printout(cpu, 0, counter, uval, prefix, run, ena, 1.0);
  
-                               fprintf(output, "%-*s%s",
-                                       csv_output ? 0 : unit_width,
-                                       counter->unit, csv_sep);
+               fputc('\n', output);
+       }
+}
  
-                               fprintf(output, "%*s",
-                                       csv_output ? 0 : -25,
-                                       perf_evsel__name(counter));
+static void print_no_aggr_metric(char *prefix)
+{
+       int cpu;
+       int nrcpus = 0;
+       struct perf_evsel *counter;
+       u64 ena, run, val;
+       double uval;
  
-                       if (counter->cgrp)
-                               fprintf(output, "%s%s",
-                                       csv_sep, counter->cgrp->name);
+       nrcpus = evsel_list->cpus->nr;
+       for (cpu = 0; cpu < nrcpus; cpu++) {
+               bool first = true;
  
-                       print_running(run, ena);
-                       fputc('\n', output);
-                       continue;
+               if (prefix)
+                       fputs(prefix, stat_config.output);
+               evlist__for_each(evsel_list, counter) {
+                       if (first) {
+                               aggr_printout(counter, cpu, 0);
+                               first = false;
+                       }
+                       val = perf_counts(counter->counts, cpu, 0)->val;
+                       ena = perf_counts(counter->counts, cpu, 0)->ena;
+                       run = perf_counts(counter->counts, cpu, 0)->run;
+
+                       uval = val * counter->scale;
+                       printout(cpu, 0, counter, uval, prefix, run, ena, 1.0);
                 }
+               fputc('\n', stat_config.output);
+       }
+}
  
-               uval = val * counter->scale;
-               printout(cpu, 0, counter, uval);
-               if (!csv_output)
-                       print_noise(counter, 1.0);
-               print_running(run, ena);
+static int aggr_header_lens[] = {
+       [AGGR_CORE] = 18,
+       [AGGR_SOCKET] = 12,
+       [AGGR_NONE] = 6,
+       [AGGR_THREAD] = 24,
+       [AGGR_GLOBAL] = 0,
+};
  
-               fputc('\n', output);
+static void print_metric_headers(char *prefix)
+{
+       struct perf_stat_output_ctx out;
+       struct perf_evsel *counter;
+       struct outstate os = {
+               .fh = stat_config.output
+       };
+
+       if (prefix)
+               fprintf(stat_config.output, "%s", prefix);
+
+       if (!csv_output)
+               fprintf(stat_config.output, "%*s",
+                       aggr_header_lens[stat_config.aggr_mode], "");
+
+       /* Print metrics headers only */
+       evlist__for_each(evsel_list, counter) {
+               os.evsel = counter;
+               out.ctx = &os;
+               out.print_metric = print_metric_header;
+               out.new_line = new_line_metric;
+               os.evsel = counter;
+               perf_stat__print_shadow_stats(counter, 0,
+                                             0,
+                                             &out);
         }
+       fputc('\n', stat_config.output);
  }
  
  static void print_interval(char *prefix, struct timespec *ts)
@@ -1014,7 +1324,7 @@ static void print_interval(char *prefix, struct timespec *ts)
  
         sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep);
  
-       if (num_print_interval == 0 && !csv_output) {
+       if (num_print_interval == 0 && !csv_output && !metric_only) {
                 switch (stat_config.aggr_mode) {
                 case AGGR_SOCKET:
                         fprintf(output, "#           time socket cpus             counts %*s events\n", unit_width, "unit");
@@ -1101,6 +1411,17 @@ static void print_counters(struct timespec *ts, int argc, const char **argv)
         else
                 print_header(argc, argv);
  
+       if (metric_only) {
+               static int num_print_iv;
+
+               if (num_print_iv == 0)
+                       print_metric_headers(prefix);
+               if (num_print_iv++ == 25)
+                       num_print_iv = 0;
+               if (stat_config.aggr_mode == AGGR_GLOBAL && prefix)
+                       fprintf(stat_config.output, "%s", prefix);
+       }
+
         switch (stat_config.aggr_mode) {
         case AGGR_CORE:
         case AGGR_SOCKET:
@@ -1113,10 +1434,16 @@ static void print_counters(struct timespec *ts, int argc, const char **argv)
         case AGGR_GLOBAL:
                 evlist__for_each(evsel_list, counter)
                         print_counter_aggr(counter, prefix);
+               if (metric_only)
+                       fputc('\n', stat_config.output);
                 break;
         case AGGR_NONE:
-               evlist__for_each(evsel_list, counter)
-                       print_counter(counter, prefix);
+               if (metric_only)
+                       print_no_aggr_metric(prefix);
+               else {
+                       evlist__for_each(evsel_list, counter)
+                               print_counter(counter, prefix);
+               }
                 break;
         case AGGR_UNSET:
         default:
@@ -1237,6 +1564,8 @@ static const struct option stat_options[] = {
                      "aggregate counts per thread", AGGR_THREAD),
         OPT_UINTEGER('D', "delay", &initial_delay,
                      "ms to wait before starting measurement after program start"),
+       OPT_BOOLEAN(0, "metric-only", &metric_only,
+                       "Only print computed metrics. No raw values"),
         OPT_END()
  };
  
@@ -1435,7 +1764,7 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st)
   */
  static int add_default_attributes(void)
  {
-       struct perf_event_attr default_attrs[] = {
+       struct perf_event_attr default_attrs0[] = {
  
    { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK             },
    { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES       },
@@ -1443,8 +1772,14 @@ static int add_default_attributes(void)
    { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS            },
  
    { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES             },
+};
+       struct perf_event_attr frontend_attrs[] = {
    { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND        },
+};
+       struct perf_event_attr backend_attrs[] = {
    { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
+};
+       struct perf_event_attr default_attrs1[] = {
    { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS           },
    { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS    },
    { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES          },
@@ -1561,7 +1896,19 @@ static int add_default_attributes(void)
         }
  
         if (!evsel_list->nr_entries) {
-               if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0)
+               if (perf_evlist__add_default_attrs(evsel_list, default_attrs0) < 0)
+                       return -1;
+               if (pmu_have_event("cpu", "stalled-cycles-frontend")) {
+                       if (perf_evlist__add_default_attrs(evsel_list,
+                                               frontend_attrs) < 0)
+                               return -1;
+               }
+               if (pmu_have_event("cpu", "stalled-cycles-backend")) {
+                       if (perf_evlist__add_default_attrs(evsel_list,
+                                               backend_attrs) < 0)
+                               return -1;
+               }
+               if (perf_evlist__add_default_attrs(evsel_list, default_attrs1) < 0)
                         return -1;
         }
  
@@ -1825,9 +2172,11 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
         if (evsel_list == NULL)
                 return -ENOMEM;
  
+       parse_events__shrink_config_terms();
         argc = parse_options_subcommand(argc, argv, stat_options, stat_subcommands,
                                         (const char **) stat_usage,
                                         PARSE_OPT_STOP_AT_NON_OPTION);
+       perf_stat__init_shadow_stats();
  
         if (csv_sep) {
                 csv_output = true;
@@ -1858,6 +2207,16 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
                 goto out;
         }
  
+       if (metric_only && stat_config.aggr_mode == AGGR_THREAD) {
+               fprintf(stderr, "--metric-only is not supported with --per-thread\n");
+               goto out;
+       }
+
+       if (metric_only && run_count > 1) {
+               fprintf(stderr, "--metric-only is not supported with -r\n");
+               goto out;
+       }
+
         if (output_fd < 0) {
                 fprintf(stderr, "argument to --log-fd must be a > 0\n");
                 parse_options_usage(stat_usage, stat_options, "log-fd", 0);
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c

index bf01cbb0ef2369f2fc904809b86b70a823bdb604..94af190f6843d2c92e68b2dfef8dc51017c42079 100644 (file)
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -252,7 +252,8 @@ static void perf_top__print_sym_table(struct perf_top *top)
         char bf[160];
         int printed = 0;
         const int win_width = top->winsize.ws_col - 1;
-       struct hists *hists = evsel__hists(top->sym_evsel);
+       struct perf_evsel *evsel = top->sym_evsel;
+       struct hists *hists = evsel__hists(evsel);
  
         puts(CONSOLE_CLEAR);
  
@@ -288,7 +289,7 @@ static void perf_top__print_sym_table(struct perf_top *top)
         }
  
         hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
  
         hists__output_recalc_col_len(hists, top->print_entries - printed);
         putchar('\n');
@@ -540,6 +541,7 @@ static bool perf_top__handle_keypress(struct perf_top *top, int c)
  static void perf_top__sort_new_samples(void *arg)
  {
         struct perf_top *t = arg;
+       struct perf_evsel *evsel = t->sym_evsel;
         struct hists *hists;
  
         perf_top__reset_sample_counters(t);
@@ -547,7 +549,7 @@ static void perf_top__sort_new_samples(void *arg)
         if (t->evlist->selected != NULL)
                 t->sym_evsel = t->evlist->selected;
  
-       hists = evsel__hists(t->sym_evsel);
+       hists = evsel__hists(evsel);
  
         if (t->evlist->enabled) {
                 if (t->zero) {
@@ -559,7 +561,7 @@ static void perf_top__sort_new_samples(void *arg)
         }
  
         hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
  }
  
  static void *display_thread_tui(void *arg)
@@ -1063,7 +1065,7 @@ parse_callchain_opt(const struct option *opt, const char *arg, int unset)
         return parse_callchain_top_opt(arg);
  }
  
-static int perf_top_config(const char *var, const char *value, void *cb)
+static int perf_top_config(const char *var, const char *value, void *cb __maybe_unused)
  {
         if (!strcmp(var, "top.call-graph"))
                 var = "call-graph.record-mode"; /* fall-through */
@@ -1072,7 +1074,7 @@ static int perf_top_config(const char *var, const char *value, void *cb)
                 return 0;
         }
  
-       return perf_default_config(var, value, cb);
+       return 0;
  }
  
  static int
@@ -1212,6 +1214,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
                      parse_branch_stack),
         OPT_BOOLEAN(0, "raw-trace", &symbol_conf.raw_trace,
                     "Show raw trace event output (do not use print fmt or plugins)"),
+       OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy,
+                   "Show entries in a hierarchy"),
         OPT_END()
         };
         const char * const top_usage[] = {
@@ -1239,10 +1243,30 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
                 goto out_delete_evlist;
         }
  
+       if (symbol_conf.report_hierarchy) {
+               /* disable incompatible options */
+               symbol_conf.event_group = false;
+               symbol_conf.cumulate_callchain = false;
+
+               if (field_order) {
+                       pr_err("Error: --hierarchy and --fields options cannot be used together\n");
+                       parse_options_usage(top_usage, options, "fields", 0);
+                       parse_options_usage(NULL, options, "hierarchy", 0);
+                       goto out_delete_evlist;
+               }
+       }
+
         sort__mode = SORT_MODE__TOP;
         /* display thread wants entries to be collapsed in a different tree */
         sort__need_collapse = 1;
  
+       if (top.use_stdio)
+               use_browser = 0;
+       else if (top.use_tui)
+               use_browser = 1;
+
+       setup_browser(false);
+
         if (setup_sorting(top.evlist) < 0) {
                 if (sort_order)
                         parse_options_usage(top_usage, options, "s", 1);
@@ -1252,13 +1276,6 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
                 goto out_delete_evlist;
         }
  
-       if (top.use_stdio)
-               use_browser = 0;
-       else if (top.use_tui)
-               use_browser = 1;
-
-       setup_browser(false);
-
         status = target__validate(target);
         if (status) {
                 target__strerror(target, status, errbuf, BUFSIZ);
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c

index 20916dd77aac24847bffbaa5d1c0fb56580d6e39..8dc98c598b1aed734ab6a6695c7308f3c16e0910 100644 (file)
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -33,6 +33,7 @@
  #include "util/stat.h"
  #include "trace-event.h"
  #include "util/parse-events.h"
+#include "util/bpf-loader.h"
  
  #include <libaudit.h>
  #include <stdlib.h>
@@ -1724,8 +1725,12 @@ static int trace__read_syscall_info(struct trace *trace, int id)
  
         sc->args = sc->tp_format->format.fields;
         sc->nr_args = sc->tp_format->format.nr_fields;
-       /* drop nr field - not relevant here; does not exist on older kernels */
-       if (sc->args && strcmp(sc->args->name, "nr") == 0) {
+       /*
+        * We need to check and discard the first variable '__syscall_nr'
+        * or 'nr' that mean the syscall number. It is needless here.
+        * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
+        */
+       if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
                 sc->args = sc->args->next;
                 --sc->nr_args;
         }
@@ -2177,6 +2182,37 @@ out_dump:
         return 0;
  }
  
+static void bpf_output__printer(enum binary_printer_ops op,
+                               unsigned int val, void *extra)
+{
+       FILE *output = extra;
+       unsigned char ch = (unsigned char)val;
+
+       switch (op) {
+       case BINARY_PRINT_CHAR_DATA:
+               fprintf(output, "%c", isprint(ch) ? ch : '.');
+               break;
+       case BINARY_PRINT_DATA_BEGIN:
+       case BINARY_PRINT_LINE_BEGIN:
+       case BINARY_PRINT_ADDR:
+       case BINARY_PRINT_NUM_DATA:
+       case BINARY_PRINT_NUM_PAD:
+       case BINARY_PRINT_SEP:
+       case BINARY_PRINT_CHAR_PAD:
+       case BINARY_PRINT_LINE_END:
+       case BINARY_PRINT_DATA_END:
+       default:
+               break;
+       }
+}
+
+static void bpf_output__fprintf(struct trace *trace,
+                               struct perf_sample *sample)
+{
+       print_binary(sample->raw_data, sample->raw_size, 8,
+                    bpf_output__printer, trace->output);
+}
+
  static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
                                 union perf_event *event __maybe_unused,
                                 struct perf_sample *sample)
@@ -2189,7 +2225,9 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
  
         fprintf(trace->output, "%s:", evsel->name);
  
-       if (evsel->tp_format) {
+       if (perf_evsel__is_bpf_output(evsel)) {
+               bpf_output__fprintf(trace, sample);
+       } else if (evsel->tp_format) {
                 event_format__fprintf(evsel->tp_format, sample->cpu,
                                       sample->raw_data, sample->raw_size,
                                       trace->output);
@@ -2586,6 +2624,16 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
         if (err < 0)
                 goto out_error_open;
  
+       err = bpf__apply_obj_config();
+       if (err) {
+               char errbuf[BUFSIZ];
+
+               bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
+               pr_err("ERROR: Apply config to BPF failed: %s\n",
+                        errbuf);
+               goto out_error_open;
+       }
+
         /*
          * Better not use !target__has_task() here because we need to cover the
          * case where no threads were specified in the command line, but a
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile

index 511141b102e8464fe3e102007f792220306ad3b9..eca6a912e8c22b0df342a7d955c6bad2db5ddf47 100644 (file)
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -61,50 +61,45 @@ endif
  
  ifeq ($(LIBUNWIND_LIBS),)
    NO_LIBUNWIND := 1
-else
-  #
-  # For linking with debug library, run like:
-  #
-  #   make DEBUG=1 LIBUNWIND_DIR=/opt/libunwind/
-  #
-  ifdef LIBUNWIND_DIR
-    LIBUNWIND_CFLAGS  = -I$(LIBUNWIND_DIR)/include
-    LIBUNWIND_LDFLAGS = -L$(LIBUNWIND_DIR)/lib
-  endif
-  LIBUNWIND_LDFLAGS += $(LIBUNWIND_LIBS)
-
-  # Set per-feature check compilation flags
-  FEATURE_CHECK_CFLAGS-libunwind = $(LIBUNWIND_CFLAGS)
-  FEATURE_CHECK_LDFLAGS-libunwind = $(LIBUNWIND_LDFLAGS)
-  FEATURE_CHECK_CFLAGS-libunwind-debug-frame = $(LIBUNWIND_CFLAGS)
-  FEATURE_CHECK_LDFLAGS-libunwind-debug-frame = $(LIBUNWIND_LDFLAGS)
  endif
+#
+# For linking with debug library, run like:
+#
+#   make DEBUG=1 LIBUNWIND_DIR=/opt/libunwind/
+#
+ifdef LIBUNWIND_DIR
+  LIBUNWIND_CFLAGS  = -I$(LIBUNWIND_DIR)/include
+  LIBUNWIND_LDFLAGS = -L$(LIBUNWIND_DIR)/lib
+endif
+LIBUNWIND_LDFLAGS += $(LIBUNWIND_LIBS)
+
+# Set per-feature check compilation flags
+FEATURE_CHECK_CFLAGS-libunwind = $(LIBUNWIND_CFLAGS)
+FEATURE_CHECK_LDFLAGS-libunwind = $(LIBUNWIND_LDFLAGS)
+FEATURE_CHECK_CFLAGS-libunwind-debug-frame = $(LIBUNWIND_CFLAGS)
+FEATURE_CHECK_LDFLAGS-libunwind-debug-frame = $(LIBUNWIND_LDFLAGS)
  
  ifeq ($(NO_PERF_REGS),0)
    CFLAGS += -DHAVE_PERF_REGS_SUPPORT
  endif
  
-ifndef NO_LIBELF
-  # for linking with debug library, run like:
-  # make DEBUG=1 LIBDW_DIR=/opt/libdw/
-  ifdef LIBDW_DIR
-    LIBDW_CFLAGS  := -I$(LIBDW_DIR)/include
-    LIBDW_LDFLAGS := -L$(LIBDW_DIR)/lib
-  endif
-  FEATURE_CHECK_CFLAGS-libdw-dwarf-unwind := $(LIBDW_CFLAGS)
-  FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind := $(LIBDW_LDFLAGS) -ldw
+# for linking with debug library, run like:
+# make DEBUG=1 LIBDW_DIR=/opt/libdw/
+ifdef LIBDW_DIR
+  LIBDW_CFLAGS  := -I$(LIBDW_DIR)/include
+  LIBDW_LDFLAGS := -L$(LIBDW_DIR)/lib
  endif
+FEATURE_CHECK_CFLAGS-libdw-dwarf-unwind := $(LIBDW_CFLAGS)
+FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind := $(LIBDW_LDFLAGS) -ldw
  
-ifdef LIBBABELTRACE
-  # for linking with debug library, run like:
-  # make DEBUG=1 LIBBABELTRACE_DIR=/opt/libbabeltrace/
-  ifdef LIBBABELTRACE_DIR
-    LIBBABELTRACE_CFLAGS  := -I$(LIBBABELTRACE_DIR)/include
-    LIBBABELTRACE_LDFLAGS := -L$(LIBBABELTRACE_DIR)/lib
-  endif
-  FEATURE_CHECK_CFLAGS-libbabeltrace := $(LIBBABELTRACE_CFLAGS)
-  FEATURE_CHECK_LDFLAGS-libbabeltrace := $(LIBBABELTRACE_LDFLAGS) -lbabeltrace-ctf
+# for linking with debug library, run like:
+# make DEBUG=1 LIBBABELTRACE_DIR=/opt/libbabeltrace/
+ifdef LIBBABELTRACE_DIR
+  LIBBABELTRACE_CFLAGS  := -I$(LIBBABELTRACE_DIR)/include
+  LIBBABELTRACE_LDFLAGS := -L$(LIBBABELTRACE_DIR)/lib
  endif
+FEATURE_CHECK_CFLAGS-libbabeltrace := $(LIBBABELTRACE_CFLAGS)
+FEATURE_CHECK_LDFLAGS-libbabeltrace := $(LIBBABELTRACE_LDFLAGS) -lbabeltrace-ctf
  
  FEATURE_CHECK_CFLAGS-bpf = -I. -I$(srctree)/tools/include -I$(srctree)/arch/$(ARCH)/include/uapi -I$(srctree)/include/uapi
  # include ARCH specific config
@@ -145,28 +140,26 @@ ifdef PARSER_DEBUG
    $(call detected_var,PARSER_DEBUG_FLEX)
  endif
  
-ifndef NO_LIBPYTHON
-  # Try different combinations to accommodate systems that only have
-  # python[2][-config] in weird combinations but always preferring
-  # python2 and python2-config as per pep-0394. If we catch a
-  # python[-config] in version 3, the version check will kill it.
-  PYTHON2 := $(if $(call get-executable,python2),python2,python)
-  override PYTHON := $(call get-executable-or-default,PYTHON,$(PYTHON2))
-  PYTHON2_CONFIG := \
-    $(if $(call get-executable,$(PYTHON)-config),$(PYTHON)-config,python-config)
-  override PYTHON_CONFIG := \
-    $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON2_CONFIG))
+# Try different combinations to accommodate systems that only have
+# python[2][-config] in weird combinations but always preferring
+# python2 and python2-config as per pep-0394. If we catch a
+# python[-config] in version 3, the version check will kill it.
+PYTHON2 := $(if $(call get-executable,python2),python2,python)
+override PYTHON := $(call get-executable-or-default,PYTHON,$(PYTHON2))
+PYTHON2_CONFIG := \
+  $(if $(call get-executable,$(PYTHON)-config),$(PYTHON)-config,python-config)
+override PYTHON_CONFIG := \
+  $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON2_CONFIG))
  
-  PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG))
+PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG))
  
-  PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
-  PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
+PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
+PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
  
-  FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS)
-  FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS)
-  FEATURE_CHECK_CFLAGS-libpython-version := $(PYTHON_EMBED_CCOPTS)
-  FEATURE_CHECK_LDFLAGS-libpython-version := $(PYTHON_EMBED_LDOPTS)
-endif
+FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS)
+FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS)
+FEATURE_CHECK_CFLAGS-libpython-version := $(PYTHON_EMBED_CCOPTS)
+FEATURE_CHECK_LDFLAGS-libpython-version := $(PYTHON_EMBED_LDOPTS)
  
  CFLAGS += -fno-omit-frame-pointer
  CFLAGS += -ggdb3
@@ -335,6 +328,13 @@ ifndef NO_LIBELF
    endif # NO_LIBBPF
  endif # NO_LIBELF
  
+ifdef PERF_HAVE_JITDUMP
+  ifndef NO_DWARF
+    $(call detected,CONFIG_JITDUMP)
+    CFLAGS += -DHAVE_JITDUMP
+  endif
+endif
+
  ifeq ($(ARCH),powerpc)
    ifndef NO_DWARF
      CFLAGS += -DHAVE_SKIP_CALLCHAIN_IDX
@@ -411,6 +411,17 @@ ifndef NO_LIBAUDIT
    endif
  endif
  
+ifndef NO_LIBCRYPTO
+  ifneq ($(feature-libcrypto), 1)
+    msg := $(warning No libcrypto.h found, disables jitted code injection, please install libssl-devel or libssl-dev);
+    NO_LIBCRYPTO := 1
+  else
+    CFLAGS += -DHAVE_LIBCRYPTO_SUPPORT
+    EXTLIBS += -lcrypto
+    $(call detected,CONFIG_CRYPTO)
+  endif
+endif
+
  ifdef NO_NEWT
    NO_SLANG=1
  endif
diff --git a/tools/perf/jvmti/Makefile b/tools/perf/jvmti/Makefile

new file mode 100644 (file)

index 0000000..5ce61a1
--- /dev/null
+++ b/tools/perf/jvmti/Makefile
@@ -0,0 +1,89 @@
+ARCH=$(shell uname -m)
+
+ifeq ($(ARCH), x86_64)
+JARCH=amd64
+endif
+ifeq ($(ARCH), armv7l)
+JARCH=armhf
+endif
+ifeq ($(ARCH), armv6l)
+JARCH=armhf
+endif
+ifeq ($(ARCH), aarch64)
+JARCH=aarch64
+endif
+ifeq ($(ARCH), ppc64)
+JARCH=powerpc
+endif
+ifeq ($(ARCH), ppc64le)
+JARCH=powerpc
+endif
+
+DESTDIR=/usr/local
+
+VERSION=1
+REVISION=0
+AGE=0
+
+LN=ln -sf
+RM=rm
+
+SLIBJVMTI=libjvmti.so.$(VERSION).$(REVISION).$(AGE)
+VLIBJVMTI=libjvmti.so.$(VERSION)
+SLDFLAGS=-shared -Wl,-soname -Wl,$(VLIBJVMTI)
+SOLIBEXT=so
+
+# The following works at least on fedora 23, you may need the next
+# line for other distros.
+ifneq (,$(wildcard /usr/sbin/update-java-alternatives))
+JDIR=$(shell /usr/sbin/update-java-alternatives -l | head -1 | cut -d ' ' -f 3)
+else
+  ifneq (,$(wildcard /usr/sbin/alternatives))
+    JDIR=$(shell alternatives --display java | tail -1 | cut -d' ' -f 5 | sed 's%/jre/bin/java.%%g')
+  endif
+endif
+ifndef JDIR
+$(error Could not find alternatives command, you need to set JDIR= to point to the root of your Java directory)
+else
+  ifeq (,$(wildcard $(JDIR)/include/jvmti.h))
+  $(error the openjdk development package appears to me missing, install and try again)
+  endif
+endif
+$(info Using Java from $(JDIR))
+# -lrt required in 32-bit mode for clock_gettime()
+LIBS=-lelf -lrt
+INCDIR=-I $(JDIR)/include -I $(JDIR)/include/linux
+
+TARGETS=$(SLIBJVMTI)
+
+SRCS=libjvmti.c jvmti_agent.c
+OBJS=$(SRCS:.c=.o)
+SOBJS=$(OBJS:.o=.lo)
+OPT=-O2 -g -Werror -Wall
+
+CFLAGS=$(INCDIR) $(OPT)
+
+all: $(TARGETS)
+
+.c.o:
+       $(CC) $(CFLAGS) -c $*.c
+.c.lo:
+       $(CC) -fPIC -DPIC $(CFLAGS) -c $*.c -o $*.lo
+
+$(OBJS) $(SOBJS): Makefile jvmti_agent.h ../util/jitdump.h
+
+$(SLIBJVMTI):  $(SOBJS)
+       $(CC) $(CFLAGS) $(SLDFLAGS)  -o $@ $(SOBJS) $(LIBS)
+       $(LN) $@ libjvmti.$(SOLIBEXT)
+
+clean:
+       $(RM) -f *.o *.so.* *.so *.lo
+
+install:
+       -mkdir -p $(DESTDIR)/lib
+       install -m 755 $(SLIBJVMTI) $(DESTDIR)/lib/
+       (cd $(DESTDIR)/lib; $(LN) $(SLIBJVMTI) $(VLIBJVMTI))
+       (cd $(DESTDIR)/lib; $(LN) $(SLIBJVMTI) libjvmti.$(SOLIBEXT))
+       ldconfig
+
+.SUFFIXES: .c .S .o .lo
diff --git a/tools/perf/jvmti/jvmti_agent.c b/tools/perf/jvmti/jvmti_agent.c

new file mode 100644 (file)

index 0000000..6461e02
--- /dev/null
+++ b/tools/perf/jvmti/jvmti_agent.c
@@ -0,0 +1,465 @@
+/*
+ * jvmti_agent.c: JVMTI agent interface
+ *
+ * Adapted from the Oprofile code in opagent.c:
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Copyright 2007 OProfile authors
+ * Jens Wilke
+ * Daniel Hansel
+ * Copyright IBM Corporation 2007
+ */
+#include <sys/types.h>
+#include <sys/stat.h> /* for mkdir() */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <time.h>
+#include <sys/mman.h>
+#include <syscall.h> /* for gettid() */
+#include <err.h>
+
+#include "jvmti_agent.h"
+#include "../util/jitdump.h"
+
+#define JIT_LANG "java"
+
+static char jit_path[PATH_MAX];
+static void *marker_addr;
+
+/*
+ * padding buffer
+ */
+static const char pad_bytes[7];
+
+static inline pid_t gettid(void)
+{
+       return (pid_t)syscall(__NR_gettid);
+}
+
+static int get_e_machine(struct jitheader *hdr)
+{
+       ssize_t sret;
+       char id[16];
+       int fd, ret = -1;
+       int m = -1;
+       struct {
+               uint16_t e_type;
+               uint16_t e_machine;
+       } info;
+
+       fd = open("/proc/self/exe", O_RDONLY);
+       if (fd == -1)
+               return -1;
+
+       sret = read(fd, id, sizeof(id));
+       if (sret != sizeof(id))
+               goto error;
+
+       /* check ELF signature */
+       if (id[0] != 0x7f || id[1] != 'E' || id[2] != 'L' || id[3] != 'F')
+               goto error;
+
+       sret = read(fd, &info, sizeof(info));
+       if (sret != sizeof(info))
+               goto error;
+
+       m = info.e_machine;
+       if (m < 0)
+               m = 0; /* ELF EM_NONE */
+
+       hdr->elf_mach = m;
+       ret = 0;
+error:
+       close(fd);
+       return ret;
+}
+
+#define NSEC_PER_SEC   1000000000
+static int perf_clk_id = CLOCK_MONOTONIC;
+
+static inline uint64_t
+timespec_to_ns(const struct timespec *ts)
+{
+        return ((uint64_t) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
+}
+
+static inline uint64_t
+perf_get_timestamp(void)
+{
+       struct timespec ts;
+       int ret;
+
+       ret = clock_gettime(perf_clk_id, &ts);
+       if (ret)
+               return 0;
+
+       return timespec_to_ns(&ts);
+}
+
+static int
+debug_cache_init(void)
+{
+       char str[32];
+       char *base, *p;
+       struct tm tm;
+       time_t t;
+       int ret;
+
+       time(&t);
+       localtime_r(&t, &tm);
+
+       base = getenv("JITDUMPDIR");
+       if (!base)
+               base = getenv("HOME");
+       if (!base)
+               base = ".";
+
+       strftime(str, sizeof(str), JIT_LANG"-jit-%Y%m%d", &tm);
+
+       snprintf(jit_path, PATH_MAX - 1, "%s/.debug/", base);
+
+       ret = mkdir(jit_path, 0755);
+       if (ret == -1) {
+               if (errno != EEXIST) {
+                       warn("jvmti: cannot create jit cache dir %s", jit_path);
+                       return -1;
+               }
+       }
+
+       snprintf(jit_path, PATH_MAX - 1, "%s/.debug/jit", base);
+       ret = mkdir(jit_path, 0755);
+       if (ret == -1) {
+               if (errno != EEXIST) {
+                       warn("cannot create jit cache dir %s", jit_path);
+                       return -1;
+               }
+       }
+
+       snprintf(jit_path, PATH_MAX - 1, "%s/.debug/jit/%s.XXXXXXXX", base, str);
+
+       p = mkdtemp(jit_path);
+       if (p != jit_path) {
+               warn("cannot create jit cache dir %s", jit_path);
+               return -1;
+       }
+
+       return 0;
+}
+
+static int
+perf_open_marker_file(int fd)
+{
+       long pgsz;
+
+       pgsz = sysconf(_SC_PAGESIZE);
+       if (pgsz == -1)
+               return -1;
+
+       /*
+        * we mmap the jitdump to create an MMAP RECORD in perf.data file.
+        * The mmap is captured either live (perf record running when we mmap)
+        * or  in deferred mode, via /proc/PID/maps
+        * the MMAP record is used as a marker of a jitdump file for more meta
+        * data info about the jitted code. Perf report/annotate detect this
+        * special filename and process the jitdump file.
+        *
+        * mapping must be PROT_EXEC to ensure it is captured by perf record
+        * even when not using -d option
+        */
+       marker_addr = mmap(NULL, pgsz, PROT_READ|PROT_EXEC, MAP_PRIVATE, fd, 0);
+       return (marker_addr == MAP_FAILED) ? -1 : 0;
+}
+
+static void
+perf_close_marker_file(void)
+{
+       long pgsz;
+
+       if (!marker_addr)
+               return;
+
+       pgsz = sysconf(_SC_PAGESIZE);
+       if (pgsz == -1)
+               return;
+
+       munmap(marker_addr, pgsz);
+}
+
+void *jvmti_open(void)
+{
+       int pad_cnt;
+       char dump_path[PATH_MAX];
+       struct jitheader header;
+       int fd;
+       FILE *fp;
+
+       /*
+        * check if clockid is supported
+        */
+       if (!perf_get_timestamp())
+               warnx("jvmti: kernel does not support %d clock id", perf_clk_id);
+
+       memset(&header, 0, sizeof(header));
+
+       debug_cache_init();
+
+       /*
+        * jitdump file name
+        */
+       snprintf(dump_path, PATH_MAX, "%s/jit-%i.dump", jit_path, getpid());
+
+       fd = open(dump_path, O_CREAT|O_TRUNC|O_RDWR, 0666);
+       if (fd == -1)
+               return NULL;
+
+       /*
+        * create perf.data maker for the jitdump file
+        */
+       if (perf_open_marker_file(fd)) {
+               warnx("jvmti: failed to create marker file");
+               return NULL;
+       }
+
+       fp = fdopen(fd, "w+");
+       if (!fp) {
+               warn("jvmti: cannot create %s", dump_path);
+               close(fd);
+               goto error;
+       }
+
+       warnx("jvmti: jitdump in %s", dump_path);
+
+       if (get_e_machine(&header)) {
+               warn("get_e_machine failed\n");
+               goto error;
+       }
+
+       header.magic      = JITHEADER_MAGIC;
+       header.version    = JITHEADER_VERSION;
+       header.total_size = sizeof(header);
+       header.pid        = getpid();
+
+       /* calculate amount of padding '\0' */
+       pad_cnt = PADDING_8ALIGNED(header.total_size);
+       header.total_size += pad_cnt;
+
+       header.timestamp = perf_get_timestamp();
+
+       if (!fwrite(&header, sizeof(header), 1, fp)) {
+               warn("jvmti: cannot write dumpfile header");
+               goto error;
+       }
+
+       /* write padding '\0' if necessary */
+       if (pad_cnt && !fwrite(pad_bytes, pad_cnt, 1, fp)) {
+               warn("jvmti: cannot write dumpfile header padding");
+               goto error;
+       }
+
+       return fp;
+error:
+       fclose(fp);
+       return NULL;
+}
+
+int
+jvmti_close(void *agent)
+{
+       struct jr_code_close rec;
+       FILE *fp = agent;
+
+       if (!fp) {
+               warnx("jvmti: incalid fd in close_agent");
+               return -1;
+       }
+
+       rec.p.id = JIT_CODE_CLOSE;
+       rec.p.total_size = sizeof(rec);
+
+       rec.p.timestamp = perf_get_timestamp();
+
+       if (!fwrite(&rec, sizeof(rec), 1, fp))
+               return -1;
+
+       fclose(fp);
+
+       fp = NULL;
+
+       perf_close_marker_file();
+
+       return 0;
+}
+
+int
+jvmti_write_code(void *agent, char const *sym,
+       uint64_t vma, void const *code, unsigned int const size)
+{
+       static int code_generation = 1;
+       struct jr_code_load rec;
+       size_t sym_len;
+       size_t padding_count;
+       FILE *fp = agent;
+       int ret = -1;
+
+       /* don't care about 0 length function, no samples */
+       if (size == 0)
+               return 0;
+
+       if (!fp) {
+               warnx("jvmti: invalid fd in write_native_code");
+               return -1;
+       }
+
+       sym_len = strlen(sym) + 1;
+
+       rec.p.id           = JIT_CODE_LOAD;
+       rec.p.total_size   = sizeof(rec) + sym_len;
+       padding_count      = PADDING_8ALIGNED(rec.p.total_size);
+       rec.p. total_size += padding_count;
+       rec.p.timestamp    = perf_get_timestamp();
+
+       rec.code_size  = size;
+       rec.vma        = vma;
+       rec.code_addr  = vma;
+       rec.pid        = getpid();
+       rec.tid        = gettid();
+
+       if (code)
+               rec.p.total_size += size;
+
+       /*
+        * If JVM is multi-threaded, nultiple concurrent calls to agent
+        * may be possible, so protect file writes
+        */
+       flockfile(fp);
+
+       /*
+        * get code index inside lock to avoid race condition
+        */
+       rec.code_index = code_generation++;
+
+       ret = fwrite_unlocked(&rec, sizeof(rec), 1, fp);
+       fwrite_unlocked(sym, sym_len, 1, fp);
+
+       if (padding_count)
+               fwrite_unlocked(pad_bytes, padding_count, 1, fp);
+
+       if (code)
+               fwrite_unlocked(code, size, 1, fp);
+
+       funlockfile(fp);
+
+       ret = 0;
+
+       return ret;
+}
+
+int
+jvmti_write_debug_info(void *agent, uint64_t code, const char *file,
+                      jvmti_line_info_t *li, int nr_lines)
+{
+       struct jr_code_debug_info rec;
+       size_t sret, len, size, flen;
+       size_t padding_count;
+       uint64_t addr;
+       const char *fn = file;
+       FILE *fp = agent;
+       int i;
+
+       /*
+        * no entry to write
+        */
+       if (!nr_lines)
+               return 0;
+
+       if (!fp) {
+               warnx("jvmti: invalid fd in write_debug_info");
+               return -1;
+       }
+
+       flen = strlen(file) + 1;
+
+       rec.p.id        = JIT_CODE_DEBUG_INFO;
+       size            = sizeof(rec);
+       rec.p.timestamp = perf_get_timestamp();
+       rec.code_addr   = (uint64_t)(uintptr_t)code;
+       rec.nr_entry    = nr_lines;
+
+       /*
+        * on disk source line info layout:
+        * uint64_t : addr
+        * int      : line number
+        * int      : column discriminator
+        * file[]   : source file name
+        * padding  : pad to multiple of 8 bytes
+        */
+       size += nr_lines * sizeof(struct debug_entry);
+       size += flen * nr_lines;
+       /*
+        * pad to 8 bytes
+        */
+       padding_count = PADDING_8ALIGNED(size);
+
+       rec.p.total_size = size + padding_count;
+
+       /*
+        * If JVM is multi-threaded, nultiple concurrent calls to agent
+        * may be possible, so protect file writes
+        */
+       flockfile(fp);
+
+       sret = fwrite_unlocked(&rec, sizeof(rec), 1, fp);
+       if (sret != 1)
+               goto error;
+
+       for (i = 0; i < nr_lines; i++) {
+
+               addr = (uint64_t)li[i].pc;
+               len  = sizeof(addr);
+               sret = fwrite_unlocked(&addr, len, 1, fp);
+               if (sret != 1)
+                       goto error;
+
+               len  = sizeof(li[0].line_number);
+               sret = fwrite_unlocked(&li[i].line_number, len, 1, fp);
+               if (sret != 1)
+                       goto error;
+
+               len  = sizeof(li[0].discrim);
+               sret = fwrite_unlocked(&li[i].discrim, len, 1, fp);
+               if (sret != 1)
+                       goto error;
+
+               sret = fwrite_unlocked(fn, flen, 1, fp);
+               if (sret != 1)
+                       goto error;
+       }
+       if (padding_count)
+               sret = fwrite_unlocked(pad_bytes, padding_count, 1, fp);
+               if (sret != 1)
+                       goto error;
+
+       funlockfile(fp);
+       return 0;
+error:
+       funlockfile(fp);
+       return -1;
+}
diff --git a/tools/perf/jvmti/jvmti_agent.h b/tools/perf/jvmti/jvmti_agent.h

new file mode 100644 (file)

index 0000000..bedf5d0
--- /dev/null
+++ b/tools/perf/jvmti/jvmti_agent.h
@@ -0,0 +1,36 @@
+#ifndef __JVMTI_AGENT_H__
+#define __JVMTI_AGENT_H__
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <jvmti.h>
+
+#define __unused __attribute__((unused))
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef struct {
+       unsigned long   pc;
+       int             line_number;
+       int             discrim; /* discriminator -- 0 for now */
+} jvmti_line_info_t;
+
+void *jvmti_open(void);
+int   jvmti_close(void *agent);
+int   jvmti_write_code(void *agent, char const *symbol_name,
+                      uint64_t vma, void const *code,
+                      const unsigned int code_size);
+
+int   jvmti_write_debug_info(void *agent,
+                            uint64_t code,
+                            const char *file,
+                            jvmti_line_info_t *li,
+                            int nr_lines);
+
+#if defined(__cplusplus)
+}
+
+#endif
+#endif /* __JVMTI_H__ */
diff --git a/tools/perf/jvmti/libjvmti.c b/tools/perf/jvmti/libjvmti.c

new file mode 100644 (file)

index 0000000..ac12e4b
--- /dev/null
+++ b/tools/perf/jvmti/libjvmti.c
@@ -0,0 +1,304 @@
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <err.h>
+#include <jvmti.h>
+#include <jvmticmlr.h>
+#include <limits.h>
+
+#include "jvmti_agent.h"
+
+static int has_line_numbers;
+void *jvmti_agent;
+
+static jvmtiError
+do_get_line_numbers(jvmtiEnv *jvmti, void *pc, jmethodID m, jint bci,
+                   jvmti_line_info_t *tab, jint *nr)
+{
+       jint i, lines = 0;
+       jint nr_lines = 0;
+       jvmtiLineNumberEntry *loc_tab = NULL;
+       jvmtiError ret;
+
+       ret = (*jvmti)->GetLineNumberTable(jvmti, m, &nr_lines, &loc_tab);
+       if (ret != JVMTI_ERROR_NONE)
+               return ret;
+
+       for (i = 0; i < nr_lines; i++) {
+               if (loc_tab[i].start_location < bci) {
+                       tab[lines].pc = (unsigned long)pc;
+                       tab[lines].line_number = loc_tab[i].line_number;
+                       tab[lines].discrim = 0; /* not yet used */
+                       lines++;
+               } else {
+                       break;
+               }
+       }
+       (*jvmti)->Deallocate(jvmti, (unsigned char *)loc_tab);
+       *nr = lines;
+       return JVMTI_ERROR_NONE;
+}
+
+static jvmtiError
+get_line_numbers(jvmtiEnv *jvmti, const void *compile_info, jvmti_line_info_t **tab, int *nr_lines)
+{
+       const jvmtiCompiledMethodLoadRecordHeader *hdr;
+       jvmtiCompiledMethodLoadInlineRecord *rec;
+       jvmtiLineNumberEntry *lne = NULL;
+       PCStackInfo *c;
+       jint nr, ret;
+       int nr_total = 0;
+       int i, lines_total = 0;
+
+       if (!(tab && nr_lines))
+               return JVMTI_ERROR_NULL_POINTER;
+
+       /*
+        * Phase 1 -- get the number of lines necessary
+        */
+       for (hdr = compile_info; hdr != NULL; hdr = hdr->next) {
+               if (hdr->kind == JVMTI_CMLR_INLINE_INFO) {
+                       rec = (jvmtiCompiledMethodLoadInlineRecord *)hdr;
+                       for (i = 0; i < rec->numpcs; i++) {
+                               c = rec->pcinfo + i;
+                               nr = 0;
+                               /*
+                                * unfortunately, need a tab to get the number of lines!
+                                */
+                               ret = (*jvmti)->GetLineNumberTable(jvmti, c->methods[0], &nr, &lne);
+                               if (ret == JVMTI_ERROR_NONE) {
+                                       /* free what was allocated for nothing */
+                                       (*jvmti)->Deallocate(jvmti, (unsigned char *)lne);
+                                       nr_total += (int)nr;
+                               }
+                       }
+               }
+       }
+
+       if (nr_total == 0)
+               return JVMTI_ERROR_NOT_FOUND;
+
+       /*
+        * Phase 2 -- allocate big enough line table
+        */
+       *tab = malloc(nr_total * sizeof(**tab));
+       if (!*tab)
+               return JVMTI_ERROR_OUT_OF_MEMORY;
+
+       for (hdr = compile_info; hdr != NULL; hdr = hdr->next) {
+               if (hdr->kind == JVMTI_CMLR_INLINE_INFO) {
+                       rec = (jvmtiCompiledMethodLoadInlineRecord *)hdr;
+                       for (i = 0; i < rec->numpcs; i++) {
+                               c = rec->pcinfo + i;
+                               nr = 0;
+                               ret = do_get_line_numbers(jvmti, c->pc,
+                                                         c->methods[0],
+                                                         c->bcis[0],
+                                                         *tab + lines_total,
+                                                         &nr);
+                               if (ret == JVMTI_ERROR_NONE)
+                                       lines_total += nr;
+                       }
+               }
+       }
+       *nr_lines = lines_total;
+       return JVMTI_ERROR_NONE;
+}
+
+static void JNICALL
+compiled_method_load_cb(jvmtiEnv *jvmti,
+                       jmethodID method,
+                       jint code_size,
+                       void const *code_addr,
+                       jint map_length,
+                       jvmtiAddrLocationMap const *map,
+                       const void *compile_info)
+{
+       jvmti_line_info_t *line_tab = NULL;
+       jclass decl_class;
+       char *class_sign = NULL;
+       char *func_name = NULL;
+       char *func_sign = NULL;
+       char *file_name= NULL;
+       char fn[PATH_MAX];
+       uint64_t addr = (uint64_t)(uintptr_t)code_addr;
+       jvmtiError ret;
+       int nr_lines = 0; /* in line_tab[] */
+       size_t len;
+
+       ret = (*jvmti)->GetMethodDeclaringClass(jvmti, method,
+                                               &decl_class);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: cannot get declaring class");
+               return;
+       }
+
+       if (has_line_numbers && map && map_length) {
+               ret = get_line_numbers(jvmti, compile_info, &line_tab, &nr_lines);
+               if (ret != JVMTI_ERROR_NONE) {
+                       warnx("jvmti: cannot get line table for method");
+                       nr_lines = 0;
+               }
+       }
+
+       ret = (*jvmti)->GetSourceFileName(jvmti, decl_class, &file_name);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: cannot get source filename ret=%d", ret);
+               goto error;
+       }
+
+       ret = (*jvmti)->GetClassSignature(jvmti, decl_class,
+                                         &class_sign, NULL);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: getclassignature failed");
+               goto error;
+       }
+
+       ret = (*jvmti)->GetMethodName(jvmti, method, &func_name,
+                                     &func_sign, NULL);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: failed getmethodname");
+               goto error;
+       }
+
+       /*
+        * Assume path name is class hierarchy, this is a common practice with Java programs
+        */
+       if (*class_sign == 'L') {
+               int j, i = 0;
+               char *p = strrchr(class_sign, '/');
+               if (p) {
+                       /* drop the 'L' prefix and copy up to the final '/' */
+                       for (i = 0; i < (p - class_sign); i++)
+                               fn[i] = class_sign[i+1];
+               }
+               /*
+                * append file name, we use loops and not string ops to avoid modifying
+                * class_sign which is used later for the symbol name
+                */
+               for (j = 0; i < (PATH_MAX - 1) && file_name && j < strlen(file_name); j++, i++)
+                       fn[i] = file_name[j];
+               fn[i] = '\0';
+       } else {
+               /* fallback case */
+               strcpy(fn, file_name);
+       }
+       /*
+        * write source line info record if we have it
+        */
+       if (jvmti_write_debug_info(jvmti_agent, addr, fn, line_tab, nr_lines))
+               warnx("jvmti: write_debug_info() failed");
+
+       len = strlen(func_name) + strlen(class_sign) + strlen(func_sign) + 2;
+       {
+               char str[len];
+               snprintf(str, len, "%s%s%s", class_sign, func_name, func_sign);
+
+               if (jvmti_write_code(jvmti_agent, str, addr, code_addr, code_size))
+                       warnx("jvmti: write_code() failed");
+       }
+error:
+       (*jvmti)->Deallocate(jvmti, (unsigned char *)func_name);
+       (*jvmti)->Deallocate(jvmti, (unsigned char *)func_sign);
+       (*jvmti)->Deallocate(jvmti, (unsigned char *)class_sign);
+       (*jvmti)->Deallocate(jvmti, (unsigned char *)file_name);
+       free(line_tab);
+}
+
+static void JNICALL
+code_generated_cb(jvmtiEnv *jvmti,
+                 char const *name,
+                 void const *code_addr,
+                 jint code_size)
+{
+       uint64_t addr = (uint64_t)(unsigned long)code_addr;
+       int ret;
+
+       ret = jvmti_write_code(jvmti_agent, name, addr, code_addr, code_size);
+       if (ret)
+               warnx("jvmti: write_code() failed for code_generated");
+}
+
+JNIEXPORT jint JNICALL
+Agent_OnLoad(JavaVM *jvm, char *options, void *reserved __unused)
+{
+       jvmtiEventCallbacks cb;
+       jvmtiCapabilities caps1;
+       jvmtiJlocationFormat format;
+       jvmtiEnv *jvmti = NULL;
+       jint ret;
+
+       jvmti_agent = jvmti_open();
+       if (!jvmti_agent) {
+               warnx("jvmti: open_agent failed");
+               return -1;
+       }
+
+       /*
+        * Request a JVMTI interface version 1 environment
+        */
+       ret = (*jvm)->GetEnv(jvm, (void *)&jvmti, JVMTI_VERSION_1);
+       if (ret != JNI_OK) {
+               warnx("jvmti: jvmti version 1 not supported");
+               return -1;
+       }
+
+       /*
+        * acquire method_load capability, we require it
+        * request line numbers (optional)
+        */
+       memset(&caps1, 0, sizeof(caps1));
+       caps1.can_generate_compiled_method_load_events = 1;
+
+       ret = (*jvmti)->AddCapabilities(jvmti, &caps1);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: acquire compiled_method capability failed");
+               return -1;
+       }
+       ret = (*jvmti)->GetJLocationFormat(jvmti, &format);
+        if (ret == JVMTI_ERROR_NONE && format == JVMTI_JLOCATION_JVMBCI) {
+                memset(&caps1, 0, sizeof(caps1));
+                caps1.can_get_line_numbers = 1;
+                caps1.can_get_source_file_name = 1;
+               ret = (*jvmti)->AddCapabilities(jvmti, &caps1);
+                if (ret == JVMTI_ERROR_NONE)
+                        has_line_numbers = 1;
+        }
+
+       memset(&cb, 0, sizeof(cb));
+
+       cb.CompiledMethodLoad   = compiled_method_load_cb;
+       cb.DynamicCodeGenerated = code_generated_cb;
+
+       ret = (*jvmti)->SetEventCallbacks(jvmti, &cb, sizeof(cb));
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: cannot set event callbacks");
+               return -1;
+       }
+
+       ret = (*jvmti)->SetEventNotificationMode(jvmti, JVMTI_ENABLE,
+                       JVMTI_EVENT_COMPILED_METHOD_LOAD, NULL);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: setnotification failed for method_load");
+               return -1;
+       }
+
+       ret = (*jvmti)->SetEventNotificationMode(jvmti, JVMTI_ENABLE,
+                       JVMTI_EVENT_DYNAMIC_CODE_GENERATED, NULL);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: setnotification failed on code_generated");
+               return -1;
+       }
+       return 0;
+}
+
+JNIEXPORT void JNICALL
+Agent_OnUnload(JavaVM *jvm __unused)
+{
+       int ret;
+
+       ret = jvmti_close(jvmti_agent);
+       if (ret)
+               errx(1, "Error: op_close_agent()");
+}
diff --git a/tools/perf/perf.c b/tools/perf/perf.c

index a929618b8eb616f90c9bb8e26bad5416a0fdbc19..aaee0a7827477810c5c0d2d82753545592d7f22f 100644 (file)
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -454,11 +454,12 @@ static void handle_internal_command(int argc, const char **argv)
  
  static void execv_dashed_external(const char **argv)
  {
-       struct strbuf cmd = STRBUF_INIT;
+       char *cmd;
         const char *tmp;
         int status;
  
-       strbuf_addf(&cmd, "perf-%s", argv[0]);
+       if (asprintf(&cmd, "perf-%s", argv[0]) < 0)
+               goto do_die;
  
         /*
          * argv[0] must be the perf command, but the argv array
@@ -467,7 +468,7 @@ static void execv_dashed_external(const char **argv)
          * restore it on error.
          */
         tmp = argv[0];
-       argv[0] = cmd.buf;
+       argv[0] = cmd;
  
         /*
          * if we fail because the command is not found, it is
@@ -475,15 +476,16 @@ static void execv_dashed_external(const char **argv)
          */
         status = run_command_v_opt(argv, 0);
         if (status != -ERR_RUN_COMMAND_EXEC) {
-               if (IS_RUN_COMMAND_ERR(status))
+               if (IS_RUN_COMMAND_ERR(status)) {
+do_die:
                         die("unable to run '%s'", argv[0]);
+               }
                 exit(-status);
         }
         errno = ENOENT; /* as if we called execvp */
  
         argv[0] = tmp;
-
-       strbuf_release(&cmd);
+       zfree(&cmd);
  }
  
  static int run_argv(int *argcp, const char ***argv)
@@ -546,6 +548,8 @@ int main(int argc, const char **argv)
  
         srandom(time(NULL));
  
+       perf_config(perf_default_config, NULL);
+
         /* get debugfs/tracefs mount point from /proc/mounts */
         tracing_path_mount();
  
@@ -613,6 +617,8 @@ int main(int argc, const char **argv)
          */
         pthread__block_sigwinch();
  
+       perf_debug_setup();
+
         while (1) {
                 static int done_help;
                 int was_alias = run_argv(&argc, &argv);
diff --git a/tools/perf/perf.h b/tools/perf/perf.h

index 90129accffbe824e37d9831ba3b323011295f78a..5381a01c0610c0e61f079140ed5cdc2df3f87b0d 100644 (file)
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -58,6 +58,8 @@ struct record_opts {
         bool         full_auxtrace;
         bool         auxtrace_snapshot_mode;
         bool         record_switch_events;
+       bool         all_kernel;
+       bool         all_user;
         unsigned int freq;
         unsigned int mmap_pages;
         unsigned int auxtrace_mmap_pages;
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py

index 15c8400240fd9029ae34fca077304337d9c75ca6..1d95009592eb3a8a8d053e1340a3e42dd412a68c 100644 (file)
--- a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
+++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
@@ -71,7 +71,10 @@ try:
  except:
         if not audit_package_warned:
                 audit_package_warned = True
-               print "Install the audit-libs-python package to get syscall names"
+               print "Install the audit-libs-python package to get syscall names.\n" \
+                    "For example:\n  # apt-get install python-audit (Ubuntu)" \
+                    "\n  # yum install audit-libs-python (Fedora)" \
+                    "\n  etc.\n"
  
  def syscall_name(id):
         try:
diff --git a/tools/perf/tests/.gitignore b/tools/perf/tests/.gitignore

index bf016c439fbd10a3cada60253c98d60ce441aa98..8cc30e731c739495f3eb61a0da9a76246ad35600 100644 (file)
--- a/tools/perf/tests/.gitignore
+++ b/tools/perf/tests/.gitignore
@@ -1,3 +1,4 @@
  llvm-src-base.c
  llvm-src-kbuild.c
  llvm-src-prologue.c
+llvm-src-relocation.c
diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build

index 614899b88b377e07f9615d618ba7045f66051bea..1ba628ed049adbafc27c7b8900ecb838165a2aa7 100644 (file)
--- a/tools/perf/tests/Build
+++ b/tools/perf/tests/Build
@@ -31,7 +31,7 @@ perf-y += sample-parsing.o
  perf-y += parse-no-sample-id-all.o
  perf-y += kmod-path.o
  perf-y += thread-map.o
-perf-y += llvm.o llvm-src-base.o llvm-src-kbuild.o llvm-src-prologue.o
+perf-y += llvm.o llvm-src-base.o llvm-src-kbuild.o llvm-src-prologue.o llvm-src-relocation.o
  perf-y += bpf.o
  perf-y += topology.o
  perf-y += cpumap.o
@@ -59,6 +59,13 @@ $(OUTPUT)tests/llvm-src-prologue.c: tests/bpf-script-test-prologue.c tests/Build
         $(Q)sed -e 's/"/\\"/g' -e 's/\(.*\)/"\1\\n"/g' $< >> $@
         $(Q)echo ';' >> $@
  
+$(OUTPUT)tests/llvm-src-relocation.c: tests/bpf-script-test-relocation.c tests/Build
+       $(call rule_mkdir)
+       $(Q)echo '#include <tests/llvm.h>' > $@
+       $(Q)echo 'const char test_llvm__bpf_test_relocation[] =' >> $@
+       $(Q)sed -e 's/"/\\"/g' -e 's/\(.*\)/"\1\\n"/g' $< >> $@
+       $(Q)echo ';' >> $@
+
  ifeq ($(ARCH),$(filter $(ARCH),x86 arm arm64))
  perf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o
  endif
diff --git a/tools/perf/tests/bp_signal.c b/tools/perf/tests/bp_signal.c

index fb80c9eb6a95b67947b23adaeca69b5e57bac704..e7664fe3bd33739fd92be2579c30102e481e8f03 100644 (file)
--- a/tools/perf/tests/bp_signal.c
+++ b/tools/perf/tests/bp_signal.c
@@ -29,14 +29,59 @@
  
  static int fd1;
  static int fd2;
+static int fd3;
  static int overflows;
+static int overflows_2;
+
+volatile long the_var;
+
+
+/*
+ * Use ASM to ensure watchpoint and breakpoint can be triggered
+ * at one instruction.
+ */
+#if defined (__x86_64__)
+extern void __test_function(volatile long *ptr);
+asm (
+       ".globl __test_function\n"
+       "__test_function:\n"
+       "incq (%rdi)\n"
+       "ret\n");
+#elif defined (__aarch64__)
+extern void __test_function(volatile long *ptr);
+asm (
+       ".globl __test_function\n"
+       "__test_function:\n"
+       "str x30, [x0]\n"
+       "ret\n");
+
+#else
+static void __test_function(volatile long *ptr)
+{
+       *ptr = 0x1234;
+}
+#endif
  
  __attribute__ ((noinline))
  static int test_function(void)
  {
+       __test_function(&the_var);
+       the_var++;
         return time(NULL);
  }
  
+static void sig_handler_2(int signum __maybe_unused,
+                         siginfo_t *oh __maybe_unused,
+                         void *uc __maybe_unused)
+{
+       overflows_2++;
+       if (overflows_2 > 10) {
+               ioctl(fd1, PERF_EVENT_IOC_DISABLE, 0);
+               ioctl(fd2, PERF_EVENT_IOC_DISABLE, 0);
+               ioctl(fd3, PERF_EVENT_IOC_DISABLE, 0);
+       }
+}
+
  static void sig_handler(int signum __maybe_unused,
                         siginfo_t *oh __maybe_unused,
                         void *uc __maybe_unused)
@@ -54,10 +99,11 @@ static void sig_handler(int signum __maybe_unused,
                  */
                 ioctl(fd1, PERF_EVENT_IOC_DISABLE, 0);
                 ioctl(fd2, PERF_EVENT_IOC_DISABLE, 0);
+               ioctl(fd3, PERF_EVENT_IOC_DISABLE, 0);
         }
  }
  
-static int bp_event(void *fn, int setup_signal)
+static int __event(bool is_x, void *addr, int sig)
  {
         struct perf_event_attr pe;
         int fd;
@@ -67,8 +113,8 @@ static int bp_event(void *fn, int setup_signal)
         pe.size = sizeof(struct perf_event_attr);
  
         pe.config = 0;
-       pe.bp_type = HW_BREAKPOINT_X;
-       pe.bp_addr = (unsigned long) fn;
+       pe.bp_type = is_x ? HW_BREAKPOINT_X : HW_BREAKPOINT_W;
+       pe.bp_addr = (unsigned long) addr;
         pe.bp_len = sizeof(long);
  
         pe.sample_period = 1;
@@ -86,17 +132,25 @@ static int bp_event(void *fn, int setup_signal)
                 return TEST_FAIL;
         }
  
-       if (setup_signal) {
-               fcntl(fd, F_SETFL, O_RDWR|O_NONBLOCK|O_ASYNC);
-               fcntl(fd, F_SETSIG, SIGIO);
-               fcntl(fd, F_SETOWN, getpid());
-       }
+       fcntl(fd, F_SETFL, O_RDWR|O_NONBLOCK|O_ASYNC);
+       fcntl(fd, F_SETSIG, sig);
+       fcntl(fd, F_SETOWN, getpid());
  
         ioctl(fd, PERF_EVENT_IOC_RESET, 0);
  
         return fd;
  }
  
+static int bp_event(void *addr, int sig)
+{
+       return __event(true, addr, sig);
+}
+
+static int wp_event(void *addr, int sig)
+{
+       return __event(false, addr, sig);
+}
+
  static long long bp_count(int fd)
  {
         long long count;
@@ -114,7 +168,7 @@ static long long bp_count(int fd)
  int test__bp_signal(int subtest __maybe_unused)
  {
         struct sigaction sa;
-       long long count1, count2;
+       long long count1, count2, count3;
  
         /* setup SIGIO signal handler */
         memset(&sa, 0, sizeof(struct sigaction));
@@ -126,21 +180,52 @@ int test__bp_signal(int subtest __maybe_unused)
                 return TEST_FAIL;
         }
  
+       sa.sa_sigaction = (void *) sig_handler_2;
+       if (sigaction(SIGUSR1, &sa, NULL) < 0) {
+               pr_debug("failed setting up signal handler 2\n");
+               return TEST_FAIL;
+       }
+
         /*
          * We create following events:
          *
-        * fd1 - breakpoint event on test_function with SIGIO
+        * fd1 - breakpoint event on __test_function with SIGIO
          *       signal configured. We should get signal
          *       notification each time the breakpoint is hit
          *
-        * fd2 - breakpoint event on sig_handler without SIGIO
+        * fd2 - breakpoint event on sig_handler with SIGUSR1
+        *       configured. We should get SIGUSR1 each time when
+        *       breakpoint is hit
+        *
+        * fd3 - watchpoint event on __test_function with SIGIO
          *       configured.
          *
          * Following processing should happen:
-        *   - execute test_function
-        *   - fd1 event breakpoint hit -> count1 == 1
-        *   - SIGIO is delivered       -> overflows == 1
-        *   - fd2 event breakpoint hit -> count2 == 1
+        *   Exec:               Action:                       Result:
+        *   incq (%rdi)       - fd1 event breakpoint hit   -> count1 == 1
+        *                     - SIGIO is delivered
+        *   sig_handler       - fd2 event breakpoint hit   -> count2 == 1
+        *                     - SIGUSR1 is delivered
+        *   sig_handler_2                                  -> overflows_2 == 1  (nested signal)
+        *   sys_rt_sigreturn  - return from sig_handler_2
+        *   overflows++                                    -> overflows = 1
+        *   sys_rt_sigreturn  - return from sig_handler
+        *   incq (%rdi)       - fd3 event watchpoint hit   -> count3 == 1       (wp and bp in one insn)
+        *                     - SIGIO is delivered
+        *   sig_handler       - fd2 event breakpoint hit   -> count2 == 2
+        *                     - SIGUSR1 is delivered
+        *   sig_handler_2                                  -> overflows_2 == 2  (nested signal)
+        *   sys_rt_sigreturn  - return from sig_handler_2
+        *   overflows++                                    -> overflows = 2
+        *   sys_rt_sigreturn  - return from sig_handler
+        *   the_var++         - fd3 event watchpoint hit   -> count3 == 2       (standalone watchpoint)
+        *                     - SIGIO is delivered
+        *   sig_handler       - fd2 event breakpoint hit   -> count2 == 3
+        *                     - SIGUSR1 is delivered
+        *   sig_handler_2                                  -> overflows_2 == 3  (nested signal)
+        *   sys_rt_sigreturn  - return from sig_handler_2
+        *   overflows++                                    -> overflows == 3
+        *   sys_rt_sigreturn  - return from sig_handler
          *
          * The test case check following error conditions:
          * - we get stuck in signal handler because of debug
@@ -152,11 +237,13 @@ int test__bp_signal(int subtest __maybe_unused)
          *
          */
  
-       fd1 = bp_event(test_function, 1);
-       fd2 = bp_event(sig_handler, 0);
+       fd1 = bp_event(__test_function, SIGIO);
+       fd2 = bp_event(sig_handler, SIGUSR1);
+       fd3 = wp_event((void *)&the_var, SIGIO);
  
         ioctl(fd1, PERF_EVENT_IOC_ENABLE, 0);
         ioctl(fd2, PERF_EVENT_IOC_ENABLE, 0);
+       ioctl(fd3, PERF_EVENT_IOC_ENABLE, 0);
  
         /*
          * Kick off the test by trigering 'fd1'
@@ -166,15 +253,18 @@ int test__bp_signal(int subtest __maybe_unused)
  
         ioctl(fd1, PERF_EVENT_IOC_DISABLE, 0);
         ioctl(fd2, PERF_EVENT_IOC_DISABLE, 0);
+       ioctl(fd3, PERF_EVENT_IOC_DISABLE, 0);
  
         count1 = bp_count(fd1);
         count2 = bp_count(fd2);
+       count3 = bp_count(fd3);
  
         close(fd1);
         close(fd2);
+       close(fd3);
  
-       pr_debug("count1 %lld, count2 %lld, overflow %d\n",
-                count1, count2, overflows);
+       pr_debug("count1 %lld, count2 %lld, count3 %lld, overflow %d, overflows_2 %d\n",
+                count1, count2, count3, overflows, overflows_2);
  
         if (count1 != 1) {
                 if (count1 == 11)
@@ -183,12 +273,18 @@ int test__bp_signal(int subtest __maybe_unused)
                         pr_debug("failed: wrong count for bp1%lld\n", count1);
         }
  
-       if (overflows != 1)
+       if (overflows != 3)
                 pr_debug("failed: wrong overflow hit\n");
  
-       if (count2 != 1)
+       if (overflows_2 != 3)
+               pr_debug("failed: wrong overflow_2 hit\n");
+
+       if (count2 != 3)
                 pr_debug("failed: wrong count for bp2\n");
  
-       return count1 == 1 && overflows == 1 && count2 == 1 ?
+       if (count3 != 2)
+               pr_debug("failed: wrong count for bp3\n");
+
+       return count1 == 1 && overflows == 3 && count2 == 3 && overflows_2 == 3 && count3 == 2 ?
                 TEST_OK : TEST_FAIL;
  }
diff --git a/tools/perf/tests/bpf-script-test-relocation.c b/tools/perf/tests/bpf-script-test-relocation.c

new file mode 100644 (file)

index 0000000..93af774
--- /dev/null
+++ b/tools/perf/tests/bpf-script-test-relocation.c
@@ -0,0 +1,50 @@
+/*
+ * bpf-script-test-relocation.c
+ * Test BPF loader checking relocation
+ */
+#ifndef LINUX_VERSION_CODE
+# error Need LINUX_VERSION_CODE
+# error Example: for 4.2 kernel, put 'clang-opt="-DLINUX_VERSION_CODE=0x40200" into llvm section of ~/.perfconfig'
+#endif
+#define BPF_ANY 0
+#define BPF_MAP_TYPE_ARRAY 2
+#define BPF_FUNC_map_lookup_elem 1
+#define BPF_FUNC_map_update_elem 2
+
+static void *(*bpf_map_lookup_elem)(void *map, void *key) =
+       (void *) BPF_FUNC_map_lookup_elem;
+static void *(*bpf_map_update_elem)(void *map, void *key, void *value, int flags) =
+       (void *) BPF_FUNC_map_update_elem;
+
+struct bpf_map_def {
+       unsigned int type;
+       unsigned int key_size;
+       unsigned int value_size;
+       unsigned int max_entries;
+};
+
+#define SEC(NAME) __attribute__((section(NAME), used))
+struct bpf_map_def SEC("maps") my_table = {
+       .type = BPF_MAP_TYPE_ARRAY,
+       .key_size = sizeof(int),
+       .value_size = sizeof(int),
+       .max_entries = 1,
+};
+
+int this_is_a_global_val;
+
+SEC("func=sys_write")
+int bpf_func__sys_write(void *ctx)
+{
+       int key = 0;
+       int value = 0;
+
+       /*
+        * Incorrect relocation. Should not allow this program be
+        * loaded into kernel.
+        */
+       bpf_map_update_elem(&this_is_a_global_val, &key, &value, 0);
+       return 0;
+}
+char _license[] SEC("license") = "GPL";
+int _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c

index 33689a0cf821e5b9608e54b981e3280ee580890c..199501c71e272491850065910aae5003603ab10e 100644 (file)
--- a/tools/perf/tests/bpf.c
+++ b/tools/perf/tests/bpf.c
@@ -1,7 +1,11 @@
  #include <stdio.h>
  #include <sys/epoll.h>
+#include <util/util.h>
  #include <util/bpf-loader.h>
  #include <util/evlist.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <bpf/bpf.h>
  #include "tests.h"
  #include "llvm.h"
  #include "debug.h"
@@ -71,6 +75,15 @@ static struct {
                 (NR_ITERS + 1) / 4,
         },
  #endif
+       {
+               LLVM_TESTCASE_BPF_RELOCATION,
+               "Test BPF relocation checker",
+               "[bpf_relocation_test]",
+               "fix 'perf test LLVM' first",
+               "libbpf error when dealing with relocation",
+               NULL,
+               0,
+       },
  };
  
  static int do_test(struct bpf_object *obj, int (*func)(void),
@@ -99,7 +112,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void),
         parse_evlist.error = &parse_error;
         INIT_LIST_HEAD(&parse_evlist.list);
  
-       err = parse_events_load_bpf_obj(&parse_evlist, &parse_evlist.list, obj);
+       err = parse_events_load_bpf_obj(&parse_evlist, &parse_evlist.list, obj, NULL);
         if (err || list_empty(&parse_evlist.list)) {
                 pr_debug("Failed to add events selected by BPF\n");
                 return TEST_FAIL;
@@ -190,7 +203,7 @@ static int __test__bpf(int idx)
  
         ret = test_llvm__fetch_bpf_obj(&obj_buf, &obj_buf_sz,
                                        bpf_testcase_table[idx].prog_id,
-                                      true);
+                                      true, NULL);
         if (ret != TEST_OK || !obj_buf || !obj_buf_sz) {
                 pr_debug("Unable to get BPF object, %s\n",
                          bpf_testcase_table[idx].msg_compile_fail);
@@ -202,14 +215,21 @@ static int __test__bpf(int idx)
  
         obj = prepare_bpf(obj_buf, obj_buf_sz,
                           bpf_testcase_table[idx].name);
-       if (!obj) {
+       if ((!!bpf_testcase_table[idx].target_func) != (!!obj)) {
+               if (!obj)
+                       pr_debug("Fail to load BPF object: %s\n",
+                                bpf_testcase_table[idx].msg_load_fail);
+               else
+                       pr_debug("Success unexpectedly: %s\n",
+                                bpf_testcase_table[idx].msg_load_fail);
                 ret = TEST_FAIL;
                 goto out;
         }
  
-       ret = do_test(obj,
-                     bpf_testcase_table[idx].target_func,
-                     bpf_testcase_table[idx].expect_result);
+       if (obj)
+               ret = do_test(obj,
+                             bpf_testcase_table[idx].target_func,
+                             bpf_testcase_table[idx].expect_result);
  out:
         bpf__clear();
         return ret;
@@ -227,6 +247,36 @@ const char *test__bpf_subtest_get_desc(int i)
         return bpf_testcase_table[i].desc;
  }
  
+static int check_env(void)
+{
+       int err;
+       unsigned int kver_int;
+       char license[] = "GPL";
+
+       struct bpf_insn insns[] = {
+               BPF_MOV64_IMM(BPF_REG_0, 1),
+               BPF_EXIT_INSN(),
+       };
+
+       err = fetch_kernel_version(&kver_int, NULL, 0);
+       if (err) {
+               pr_debug("Unable to get kernel version\n");
+               return err;
+       }
+
+       err = bpf_load_program(BPF_PROG_TYPE_KPROBE, insns,
+                              sizeof(insns) / sizeof(insns[0]),
+                              license, kver_int, NULL, 0);
+       if (err < 0) {
+               pr_err("Missing basic BPF support, skip this test: %s\n",
+                      strerror(errno));
+               return err;
+       }
+       close(err);
+
+       return 0;
+}
+
  int test__bpf(int i)
  {
         int err;
@@ -239,6 +289,9 @@ int test__bpf(int i)
                 return TEST_SKIP;
         }
  
+       if (check_env())
+               return TEST_SKIP;
+
         err = __test__bpf(i);
         return err;
  }
diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c

index 313a48c6b2bc8e111e113e79c4a72fdc2d720ac5..afc9ad0a0515c5db77745d0d0c6964d950fa3bfc 100644 (file)
--- a/tools/perf/tests/code-reading.c
+++ b/tools/perf/tests/code-reading.c
@@ -439,7 +439,7 @@ static int do_test_code_reading(bool try_kcore)
                 .mmap_pages          = UINT_MAX,
                 .user_freq           = UINT_MAX,
                 .user_interval       = ULLONG_MAX,
-               .freq                = 4000,
+               .freq                = 500,
                 .target              = {
                         .uses_mmap   = true,
                 },
@@ -559,7 +559,13 @@ static int do_test_code_reading(bool try_kcore)
                                 evlist = NULL;
                                 continue;
                         }
-                       pr_debug("perf_evlist__open failed\n");
+
+                       if (verbose) {
+                               char errbuf[512];
+                               perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
+                               pr_debug("perf_evlist__open() failed!\n%s\n", errbuf);
+                       }
+
                         goto out_put;
                 }
                 break;
diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c

index 5e6a86e50fb97aae648c16ce627157e8377ebb02..ecf136c385d5ff2f21111e813a9ce1e30c03365a 100644 (file)
--- a/tools/perf/tests/hists_cumulate.c
+++ b/tools/perf/tests/hists_cumulate.c
@@ -191,7 +191,7 @@ static int do_test(struct hists *hists, struct result *expected, size_t nr_expec
          * function since TEST_ASSERT_VAL() returns in case of failure.
          */
         hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(hists_to_evsel(hists), NULL);
  
         if (verbose > 2) {
                 pr_info("use callchain: %d, cumulate callchain: %d\n",
diff --git a/tools/perf/tests/hists_filter.c b/tools/perf/tests/hists_filter.c

index 351a42463444a3df9f80daea3848499b8d39659a..34b945a55d4d2864cc561f36275191f57f84210a 100644 (file)
--- a/tools/perf/tests/hists_filter.c
+++ b/tools/perf/tests/hists_filter.c
@@ -145,7 +145,7 @@ int test__hists_filter(int subtest __maybe_unused)
                 struct hists *hists = evsel__hists(evsel);
  
                 hists__collapse_resort(hists, NULL);
-               hists__output_resort(hists, NULL);
+               perf_evsel__output_resort(evsel, NULL);
  
                 if (verbose > 2) {
                         pr_info("Normal histogram\n");
diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c

index b231265148d89a28e39387d423711c643351eb70..23cce67c7e48902b86c00cc6e56df22ef16386e4 100644 (file)
--- a/tools/perf/tests/hists_output.c
+++ b/tools/perf/tests/hists_output.c
@@ -156,7 +156,7 @@ static int test1(struct perf_evsel *evsel, struct machine *machine)
                 goto out;
  
         hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
  
         if (verbose > 2) {
                 pr_info("[fields = %s, sort = %s]\n", field_order, sort_order);
@@ -256,7 +256,7 @@ static int test2(struct perf_evsel *evsel, struct machine *machine)
                 goto out;
  
         hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
  
         if (verbose > 2) {
                 pr_info("[fields = %s, sort = %s]\n", field_order, sort_order);
@@ -310,7 +310,7 @@ static int test3(struct perf_evsel *evsel, struct machine *machine)
                 goto out;
  
         hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
  
         if (verbose > 2) {
                 pr_info("[fields = %s, sort = %s]\n", field_order, sort_order);
@@ -388,7 +388,7 @@ static int test4(struct perf_evsel *evsel, struct machine *machine)
                 goto out;
  
         hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
  
         if (verbose > 2) {
                 pr_info("[fields = %s, sort = %s]\n", field_order, sort_order);
@@ -491,7 +491,7 @@ static int test5(struct perf_evsel *evsel, struct machine *machine)
                 goto out;
  
         hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
  
         if (verbose > 2) {
                 pr_info("[fields = %s, sort = %s]\n", field_order, sort_order);
diff --git a/tools/perf/tests/llvm.c b/tools/perf/tests/llvm.c

index 06f45c1d42561df030e35d55078d70d910e79998..cff564fb4b66761f7f7fdcd5ee10727ad3b08726 100644 (file)
--- a/tools/perf/tests/llvm.c
+++ b/tools/perf/tests/llvm.c
@@ -6,12 +6,6 @@
  #include "tests.h"
  #include "debug.h"
  
-static int perf_config_cb(const char *var, const char *val,
-                         void *arg __maybe_unused)
-{
-       return perf_default_config(var, val, arg);
-}
-
  #ifdef HAVE_LIBBPF_SUPPORT
  static int test__bpf_parsing(void *obj_buf, size_t obj_buf_sz)
  {
@@ -35,6 +29,7 @@ static int test__bpf_parsing(void *obj_buf __maybe_unused,
  static struct {
         const char *source;
         const char *desc;
+       bool should_load_fail;
  } bpf_source_table[__LLVM_TESTCASE_MAX] = {
         [LLVM_TESTCASE_BASE] = {
                 .source = test_llvm__bpf_base_prog,
@@ -48,14 +43,19 @@ static struct {
                 .source = test_llvm__bpf_test_prologue_prog,
                 .desc = "Compile source for BPF prologue generation test",
         },
+       [LLVM_TESTCASE_BPF_RELOCATION] = {
+               .source = test_llvm__bpf_test_relocation,
+               .desc = "Compile source for BPF relocation test",
+               .should_load_fail = true,
+       },
  };
  
-
  int
  test_llvm__fetch_bpf_obj(void **p_obj_buf,
                          size_t *p_obj_buf_sz,
                          enum test_llvm__testcase idx,
-                        bool force)
+                        bool force,
+                        bool *should_load_fail)
  {
         const char *source;
         const char *desc;
@@ -68,8 +68,8 @@ test_llvm__fetch_bpf_obj(void **p_obj_buf,
  
         source = bpf_source_table[idx].source;
         desc = bpf_source_table[idx].desc;
-
-       perf_config(perf_config_cb, NULL);
+       if (should_load_fail)
+               *should_load_fail = bpf_source_table[idx].should_load_fail;
  
         /*
          * Skip this test if user's .perfconfig doesn't set [llvm] section
@@ -136,14 +136,15 @@ int test__llvm(int subtest)
         int ret;
         void *obj_buf = NULL;
         size_t obj_buf_sz = 0;
+       bool should_load_fail = false;
  
         if ((subtest < 0) || (subtest >= __LLVM_TESTCASE_MAX))
                 return TEST_FAIL;
  
         ret = test_llvm__fetch_bpf_obj(&obj_buf, &obj_buf_sz,
-                                      subtest, false);
+                                      subtest, false, &should_load_fail);
  
-       if (ret == TEST_OK) {
+       if (ret == TEST_OK && !should_load_fail) {
                 ret = test__bpf_parsing(obj_buf, obj_buf_sz);
                 if (ret != TEST_OK) {
                         pr_debug("Failed to parse test case '%s'\n",
diff --git a/tools/perf/tests/llvm.h b/tools/perf/tests/llvm.h

index 5150b4d6ef50afe357f5f86300f4d39d28bec658..0eaa604be99defec6dcf3b09ee9a75eb348096fb 100644 (file)
--- a/tools/perf/tests/llvm.h
+++ b/tools/perf/tests/llvm.h
@@ -7,14 +7,17 @@
  extern const char test_llvm__bpf_base_prog[];
  extern const char test_llvm__bpf_test_kbuild_prog[];
  extern const char test_llvm__bpf_test_prologue_prog[];
+extern const char test_llvm__bpf_test_relocation[];
  
  enum test_llvm__testcase {
         LLVM_TESTCASE_BASE,
         LLVM_TESTCASE_KBUILD,
         LLVM_TESTCASE_BPF_PROLOGUE,
+       LLVM_TESTCASE_BPF_RELOCATION,
         __LLVM_TESTCASE_MAX,
  };
  
  int test_llvm__fetch_bpf_obj(void **p_obj_buf, size_t *p_obj_buf_sz,
-                            enum test_llvm__testcase index, bool force);
+                            enum test_llvm__testcase index, bool force,
+                            bool *should_load_fail);
  #endif
diff --git a/tools/perf/tests/make b/tools/perf/tests/make

index f918015512af96d1173891dac2e8f460ff65ca85..cac15d93aea656f96ad449cba1f9529e42afcbec 100644 (file)
--- a/tools/perf/tests/make
+++ b/tools/perf/tests/make
@@ -15,6 +15,7 @@ else
  PERF := .
  PERF_O := $(PERF)
  O_OPT :=
+FULL_O := $(shell readlink -f $(PERF_O) || echo $(PERF_O))
  
  ifneq ($(O),)
    FULL_O := $(shell readlink -f $(O) || echo $(O))
@@ -79,6 +80,7 @@ make_no_libaudit    := NO_LIBAUDIT=1
  make_no_libbionic   := NO_LIBBIONIC=1
  make_no_auxtrace    := NO_AUXTRACE=1
  make_no_libbpf     := NO_LIBBPF=1
+make_no_libcrypto   := NO_LIBCRYPTO=1
  make_tags           := tags
  make_cscope         := cscope
  make_help           := help
@@ -102,6 +104,7 @@ make_minimal        := NO_LIBPERL=1 NO_LIBPYTHON=1 NO_NEWT=1 NO_GTK2=1
  make_minimal        += NO_DEMANGLE=1 NO_LIBELF=1 NO_LIBUNWIND=1 NO_BACKTRACE=1
  make_minimal        += NO_LIBNUMA=1 NO_LIBAUDIT=1 NO_LIBBIONIC=1
  make_minimal        += NO_LIBDW_DWARF_UNWIND=1 NO_AUXTRACE=1 NO_LIBBPF=1
+make_minimal        += NO_LIBCRYPTO=1
  
  # $(run) contains all available tests
  run := make_pure
@@ -110,6 +113,9 @@ run := make_pure
  # disable features detection
  ifeq ($(MK),Makefile)
  run += make_clean_all
+MAKE_F := $(MAKE)
+else
+MAKE_F := $(MAKE) -f $(MK)
  endif
  run += make_python_perf_so
  run += make_debug
@@ -260,6 +266,8 @@ run := $(shell shuf -e $(run))
  run_O := $(shell shuf -e $(run_O))
  endif
  
+max_width := $(shell echo $(run_O) | sed 's/ /\n/g' | wc -L)
+
  ifdef DEBUG
  d := $(info run   $(run))
  d := $(info run_O $(run_O))
@@ -267,13 +275,13 @@ endif
  
  MAKEFLAGS := --no-print-directory
  
-clean := @(cd $(PERF); make -s -f $(MK) $(O_OPT) clean >/dev/null)
+clean := @(cd $(PERF); $(MAKE_F) -s $(O_OPT) clean >/dev/null)
  
  $(run):
         $(call clean)
         @TMP_DEST=$$(mktemp -d); \
-       cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST $($@)"; \
-       echo "- $@: $$cmd" && echo $$cmd > $@ && \
+       cmd="cd $(PERF) && $(MAKE_F) $($@) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST"; \
+       printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \
         ( eval $$cmd ) >> $@ 2>&1; \
         echo "  test: $(call test,$@)" >> $@ 2>&1; \
         $(call test,$@) && \
@@ -283,8 +291,8 @@ $(run_O):
         $(call clean)
         @TMP_O=$$(mktemp -d); \
         TMP_DEST=$$(mktemp -d); \
-       cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) O=$$TMP_O DESTDIR=$$TMP_DEST $($(patsubst %_O,%,$@))"; \
-       echo "- $@: $$cmd" && echo $$cmd > $@ && \
+       cmd="cd $(PERF) && $(MAKE_F) $($(patsubst %_O,%,$@)) $(PARALLEL_OPT) O=$$TMP_O DESTDIR=$$TMP_DEST"; \
+       printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \
         ( eval $$cmd ) >> $@ 2>&1 && \
         echo "  test: $(call test_O,$@)" >> $@ 2>&1; \
         $(call test_O,$@) && \
@@ -313,11 +321,43 @@ make_kernelsrc_tools:
         (make -C ../../tools $(PARALLEL_OPT) $(K_O_OPT) perf) > $@ 2>&1 && \
         test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false)
  
+FEATURES_DUMP_FILE := $(FULL_O)/BUILD_TEST_FEATURE_DUMP
+FEATURES_DUMP_FILE_STATIC := $(FULL_O)/BUILD_TEST_FEATURE_DUMP_STATIC
+
  all: $(run) $(run_O) tarpkg make_kernelsrc make_kernelsrc_tools
         @echo OK
+       @rm -f $(FEATURES_DUMP_FILE) $(FEATURES_DUMP_FILE_STATIC)
  
  out: $(run_O)
         @echo OK
+       @rm -f $(FEATURES_DUMP_FILE) $(FEATURES_DUMP_FILE_STATIC)
+
+ifeq ($(REUSE_FEATURES_DUMP),1)
+$(FEATURES_DUMP_FILE):
+       $(call clean)
+       @cmd="cd $(PERF) && make FEATURE_DUMP_COPY=$@ $(O_OPT) feature-dump"; \
+       echo "- $@: $$cmd" && echo $$cmd && \
+       ( eval $$cmd ) > /dev/null 2>&1
+
+$(FEATURES_DUMP_FILE_STATIC):
+       $(call clean)
+       @cmd="cd $(PERF) && make FEATURE_DUMP_COPY=$@ $(O_OPT) LDFLAGS='-static' feature-dump"; \
+       echo "- $@: $$cmd" && echo $$cmd && \
+       ( eval $$cmd ) > /dev/null 2>&1
+
+# Add feature dump dependency for run/run_O targets
+$(foreach t,$(run) $(run_O),$(eval \
+       $(t): $(if $(findstring make_static,$(t)),\
+               $(FEATURES_DUMP_FILE_STATIC),\
+               $(FEATURES_DUMP_FILE))))
+
+# Append 'FEATURES_DUMP=' option to all test cases. For example:
+# make_no_libbpf: NO_LIBBPF=1  --> NO_LIBBPF=1 FEATURES_DUMP=/a/b/BUILD_TEST_FEATURE_DUMP
+# make_static: LDFLAGS=-static --> LDFLAGS=-static FEATURES_DUMP=/a/b/BUILD_TEST_FEATURE_DUMP_STATIC
+$(foreach t,$(run),$(if $(findstring make_static,$(t)),\
+                       $(eval $(t) := $($(t)) FEATURES_DUMP=$(FEATURES_DUMP_FILE_STATIC)),\
+                       $(eval $(t) := $($(t)) FEATURES_DUMP=$(FEATURES_DUMP_FILE))))
+endif
  
  .PHONY: all $(run) $(run_O) tarpkg clean make_kernelsrc make_kernelsrc_tools
  endif # ifndef MK
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c

index abe8849d1d7030bda2ae65f770424e3c9ad84330..7865f68dc0d82bea12c960c61792e9fd28857305 100644 (file)
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -1271,6 +1271,38 @@ static int test__checkevent_precise_max_modifier(struct perf_evlist *evlist)
         return 0;
  }
  
+static int test__checkevent_config_symbol(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
+
+       TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "insn") == 0);
+       return 0;
+}
+
+static int test__checkevent_config_raw(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
+
+       TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "rawpmu") == 0);
+       return 0;
+}
+
+static int test__checkevent_config_num(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
+
+       TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "numpmu") == 0);
+       return 0;
+}
+
+static int test__checkevent_config_cache(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
+
+       TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "cachepmu") == 0);
+       return 0;
+}
+
  static int count_tracepoints(void)
  {
         struct dirent *events_ent;
@@ -1579,6 +1611,26 @@ static struct evlist_test test__events[] = {
                 .check = test__checkevent_precise_max_modifier,
                 .id    = 47,
         },
+       {
+               .name  = "instructions/name=insn/",
+               .check = test__checkevent_config_symbol,
+               .id    = 48,
+       },
+       {
+               .name  = "r1234/name=rawpmu/",
+               .check = test__checkevent_config_raw,
+               .id    = 49,
+       },
+       {
+               .name  = "4:0x6530160/name=numpmu/",
+               .check = test__checkevent_config_num,
+               .id    = 50,
+       },
+       {
+               .name  = "L1-dcache-misses/name=cachepmu/",
+               .check = test__checkevent_config_cache,
+               .id    = 51,
+       },
  };
  
  static struct evlist_test test__events_pmu[] = {
@@ -1666,7 +1718,7 @@ static int test_term(struct terms_test *t)
         }
  
         ret = t->check(&terms);
-       parse_events__free_terms(&terms);
+       parse_events_terms__purge(&terms);
  
         return ret;
  }
diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c

index f0bfc9e8fd9f617d69c0b3cb36fe6c210ebbd6d5..630b0b409b973f87ba00312e5e4e3d8fdf829f32 100644 (file)
--- a/tools/perf/tests/vmlinux-kallsyms.c
+++ b/tools/perf/tests/vmlinux-kallsyms.c
@@ -110,7 +110,6 @@ int test__vmlinux_matches_kallsyms(int subtest __maybe_unused)
          */
         for (nd = rb_first(&vmlinux_map->dso->symbols[type]); nd; nd = rb_next(nd)) {
                 struct symbol *pair, *first_pair;
-               bool backwards = true;
  
                 sym  = rb_entry(nd, struct symbol, rb_node);
  
@@ -151,27 +150,14 @@ next_pair:
                                 continue;
  
                         } else {
-                               struct rb_node *nnd;
-detour:
-                               nnd = backwards ? rb_prev(&pair->rb_node) :
-                                                 rb_next(&pair->rb_node);
-                               if (nnd) {
-                                       struct symbol *next = rb_entry(nnd, struct symbol, rb_node);
-
-                                       if (UM(next->start) == mem_start) {
-                                               pair = next;
+                               pair = machine__find_kernel_symbol_by_name(&kallsyms, type, sym->name, NULL, NULL);
+                               if (pair) {
+                                       if (UM(pair->start) == mem_start)
                                                 goto next_pair;
-                                       }
-                               }
  
-                               if (backwards) {
-                                       backwards = false;
-                                       pair = first_pair;
-                                       goto detour;
+                                       pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n",
+                                                mem_start, sym->name, pair->name);
                                 }
-
-                               pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n",
-                                        mem_start, sym->name, pair->name);
                         }
                 } else
                         pr_debug("%#" PRIx64 ": %s not on kallsyms\n",
diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c

index d37202121689a017b0790d552e088d695716c527..af68a9d488bfce964c84e67cf5394e7e13daab29 100644 (file)
--- a/tools/perf/ui/browser.c
+++ b/tools/perf/ui/browser.c
@@ -531,8 +531,8 @@ static struct ui_browser_colorset {
                 .bg       = "yellow",
         },
         {
-               .colorset = HE_COLORSET_CODE,
-               .name     = "code",
+               .colorset = HE_COLORSET_JUMP_ARROWS,
+               .name     = "jump_arrows",
                 .fg       = "blue",
                 .bg       = "default",
         },
diff --git a/tools/perf/ui/browser.h b/tools/perf/ui/browser.h

index 01781de59532ce9c9fd1ff7f8c7fdf97d45fa105..be3b70eb5fca6e402830570c4111f10258702c04 100644 (file)
--- a/tools/perf/ui/browser.h
+++ b/tools/perf/ui/browser.h
@@ -7,7 +7,7 @@
  #define HE_COLORSET_MEDIUM     51
  #define HE_COLORSET_NORMAL     52
  #define HE_COLORSET_SELECTED   53
-#define HE_COLORSET_CODE       54
+#define HE_COLORSET_JUMP_ARROWS        54
  #define HE_COLORSET_ADDR       55
  #define HE_COLORSET_ROOT       56
  
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c

index 718bd46d47fa7bc88674a192dd1a8e8ab1fb7ca9..4fc208e82c6fc7b28d99af147d1c75738bc338e6 100644 (file)
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -284,7 +284,7 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser)
                 to = (u64)btarget->idx;
         }
  
-       ui_browser__set_color(browser, HE_COLORSET_CODE);
+       ui_browser__set_color(browser, HE_COLORSET_JUMP_ARROWS);
         __ui_browser__line_arrow(browser, pcnt_width + 2 + ab->addr_width,
                                  from, to);
  }
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c

index 08c09ad755d2d2f8dd55be06b258e16fb2c8335f..4b98165559462025ded4db4a41fe8b5e66a9f151 100644 (file)
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -32,6 +32,7 @@ struct hist_browser {
         bool                 show_headers;
         float                min_pcnt;
         u64                  nr_non_filtered_entries;
+       u64                  nr_hierarchy_entries;
         u64                  nr_callchain_rows;
  };
  
@@ -58,11 +59,11 @@ static int hist_browser__get_folding(struct hist_browser *browser)
  
         for (nd = rb_first(&hists->entries);
              (nd = hists__filter_entries(nd, browser->min_pcnt)) != NULL;
-            nd = rb_next(nd)) {
+            nd = rb_hierarchy_next(nd)) {
                 struct hist_entry *he =
                         rb_entry(nd, struct hist_entry, rb_node);
  
-               if (he->unfolded)
+               if (he->leaf && he->unfolded)
                         unfolded_rows += he->nr_rows;
         }
         return unfolded_rows;
@@ -72,7 +73,9 @@ static u32 hist_browser__nr_entries(struct hist_browser *hb)
  {
         u32 nr_entries;
  
-       if (hist_browser__has_filter(hb))
+       if (symbol_conf.report_hierarchy)
+               nr_entries = hb->nr_hierarchy_entries;
+       else if (hist_browser__has_filter(hb))
                 nr_entries = hb->nr_non_filtered_entries;
         else
                 nr_entries = hb->hists->nr_entries;
@@ -247,6 +250,38 @@ static int callchain__count_rows(struct rb_root *chain)
         return n;
  }
  
+static int hierarchy_count_rows(struct hist_browser *hb, struct hist_entry *he,
+                               bool include_children)
+{
+       int count = 0;
+       struct rb_node *node;
+       struct hist_entry *child;
+
+       if (he->leaf)
+               return callchain__count_rows(&he->sorted_chain);
+
+       if (he->has_no_entry)
+               return 1;
+
+       node = rb_first(&he->hroot_out);
+       while (node) {
+               float percent;
+
+               child = rb_entry(node, struct hist_entry, rb_node);
+               percent = hist_entry__get_percent_limit(child);
+
+               if (!child->filtered && percent >= hb->min_pcnt) {
+                       count++;
+
+                       if (include_children && child->unfolded)
+                               count += hierarchy_count_rows(hb, child, true);
+               }
+
+               node = rb_next(node);
+       }
+       return count;
+}
+
  static bool hist_entry__toggle_fold(struct hist_entry *he)
  {
         if (!he)
@@ -326,11 +361,17 @@ static void callchain__init_have_children(struct rb_root *root)
  
  static void hist_entry__init_have_children(struct hist_entry *he)
  {
-       if (!he->init_have_children) {
+       if (he->init_have_children)
+               return;
+
+       if (he->leaf) {
                 he->has_children = !RB_EMPTY_ROOT(&he->sorted_chain);
                 callchain__init_have_children(&he->sorted_chain);
-               he->init_have_children = true;
+       } else {
+               he->has_children = !RB_EMPTY_ROOT(&he->hroot_out);
         }
+
+       he->init_have_children = true;
  }
  
  static bool hist_browser__toggle_fold(struct hist_browser *browser)
@@ -349,17 +390,49 @@ static bool hist_browser__toggle_fold(struct hist_browser *browser)
                 has_children = callchain_list__toggle_fold(cl);
  
         if (has_children) {
+               int child_rows = 0;
+
                 hist_entry__init_have_children(he);
                 browser->b.nr_entries -= he->nr_rows;
-               browser->nr_callchain_rows -= he->nr_rows;
  
-               if (he->unfolded)
-                       he->nr_rows = callchain__count_rows(&he->sorted_chain);
+               if (he->leaf)
+                       browser->nr_callchain_rows -= he->nr_rows;
                 else
+                       browser->nr_hierarchy_entries -= he->nr_rows;
+
+               if (symbol_conf.report_hierarchy)
+                       child_rows = hierarchy_count_rows(browser, he, true);
+
+               if (he->unfolded) {
+                       if (he->leaf)
+                               he->nr_rows = callchain__count_rows(&he->sorted_chain);
+                       else
+                               he->nr_rows = hierarchy_count_rows(browser, he, false);
+
+                       /* account grand children */
+                       if (symbol_conf.report_hierarchy)
+                               browser->b.nr_entries += child_rows - he->nr_rows;
+
+                       if (!he->leaf && he->nr_rows == 0) {
+                               he->has_no_entry = true;
+                               he->nr_rows = 1;
+                       }
+               } else {
+                       if (symbol_conf.report_hierarchy)
+                               browser->b.nr_entries -= child_rows - he->nr_rows;
+
+                       if (he->has_no_entry)
+                               he->has_no_entry = false;
+
                         he->nr_rows = 0;
+               }
  
                 browser->b.nr_entries += he->nr_rows;
-               browser->nr_callchain_rows += he->nr_rows;
+
+               if (he->leaf)
+                       browser->nr_callchain_rows += he->nr_rows;
+               else
+                       browser->nr_hierarchy_entries += he->nr_rows;
  
                 return true;
         }
@@ -422,13 +495,38 @@ static int callchain__set_folding(struct rb_root *chain, bool unfold)
         return n;
  }
  
-static void hist_entry__set_folding(struct hist_entry *he, bool unfold)
+static int hierarchy_set_folding(struct hist_browser *hb, struct hist_entry *he,
+                                bool unfold __maybe_unused)
+{
+       float percent;
+       struct rb_node *nd;
+       struct hist_entry *child;
+       int n = 0;
+
+       for (nd = rb_first(&he->hroot_out); nd; nd = rb_next(nd)) {
+               child = rb_entry(nd, struct hist_entry, rb_node);
+               percent = hist_entry__get_percent_limit(child);
+               if (!child->filtered && percent >= hb->min_pcnt)
+                       n++;
+       }
+
+       return n;
+}
+
+static void hist_entry__set_folding(struct hist_entry *he,
+                                   struct hist_browser *hb, bool unfold)
  {
         hist_entry__init_have_children(he);
         he->unfolded = unfold ? he->has_children : false;
  
         if (he->has_children) {
-               int n = callchain__set_folding(&he->sorted_chain, unfold);
+               int n;
+
+               if (he->leaf)
+                       n = callchain__set_folding(&he->sorted_chain, unfold);
+               else
+                       n = hierarchy_set_folding(hb, he, unfold);
+
                 he->nr_rows = unfold ? n : 0;
         } else
                 he->nr_rows = 0;
@@ -438,19 +536,38 @@ static void
  __hist_browser__set_folding(struct hist_browser *browser, bool unfold)
  {
         struct rb_node *nd;
-       struct hists *hists = browser->hists;
+       struct hist_entry *he;
+       double percent;
  
-       for (nd = rb_first(&hists->entries);
-            (nd = hists__filter_entries(nd, browser->min_pcnt)) != NULL;
-            nd = rb_next(nd)) {
-               struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node);
-               hist_entry__set_folding(he, unfold);
-               browser->nr_callchain_rows += he->nr_rows;
+       nd = rb_first(&browser->hists->entries);
+       while (nd) {
+               he = rb_entry(nd, struct hist_entry, rb_node);
+
+               /* set folding state even if it's currently folded */
+               nd = __rb_hierarchy_next(nd, HMD_FORCE_CHILD);
+
+               hist_entry__set_folding(he, browser, unfold);
+
+               percent = hist_entry__get_percent_limit(he);
+               if (he->filtered || percent < browser->min_pcnt)
+                       continue;
+
+               if (!he->depth || unfold)
+                       browser->nr_hierarchy_entries++;
+               if (he->leaf)
+                       browser->nr_callchain_rows += he->nr_rows;
+               else if (unfold && !hist_entry__has_hierarchy_children(he, browser->min_pcnt)) {
+                       browser->nr_hierarchy_entries++;
+                       he->has_no_entry = true;
+                       he->nr_rows = 1;
+               } else
+                       he->has_no_entry = false;
         }
  }
  
  static void hist_browser__set_folding(struct hist_browser *browser, bool unfold)
  {
+       browser->nr_hierarchy_entries = 0;
         browser->nr_callchain_rows = 0;
         __hist_browser__set_folding(browser, unfold);
  
@@ -657,9 +774,24 @@ static int hist_browser__show_callchain_list(struct hist_browser *browser,
         return 1;
  }
  
+static bool check_percent_display(struct rb_node *node, u64 parent_total)
+{
+       struct callchain_node *child;
+
+       if (node == NULL)
+               return false;
+
+       if (rb_next(node))
+               return true;
+
+       child = rb_entry(node, struct callchain_node, rb_node);
+       return callchain_cumul_hits(child) != parent_total;
+}
+
  static int hist_browser__show_callchain_flat(struct hist_browser *browser,
                                              struct rb_root *root,
                                              unsigned short row, u64 total,
+                                            u64 parent_total,
                                              print_callchain_entry_fn print,
                                              struct callchain_print_arg *arg,
                                              check_output_full_fn is_output_full)
@@ -669,7 +801,7 @@ static int hist_browser__show_callchain_flat(struct hist_browser *browser,
         bool need_percent;
  
         node = rb_first(root);
-       need_percent = node && rb_next(node);
+       need_percent = check_percent_display(node, parent_total);
  
         while (node) {
                 struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
@@ -763,6 +895,7 @@ static char *hist_browser__folded_callchain_str(struct hist_browser *browser,
  static int hist_browser__show_callchain_folded(struct hist_browser *browser,
                                                struct rb_root *root,
                                                unsigned short row, u64 total,
+                                              u64 parent_total,
                                                print_callchain_entry_fn print,
                                                struct callchain_print_arg *arg,
                                                check_output_full_fn is_output_full)
@@ -772,7 +905,7 @@ static int hist_browser__show_callchain_folded(struct hist_browser *browser,
         bool need_percent;
  
         node = rb_first(root);
-       need_percent = node && rb_next(node);
+       need_percent = check_percent_display(node, parent_total);
  
         while (node) {
                 struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
@@ -844,20 +977,24 @@ next:
         return row - first_row;
  }
  
-static int hist_browser__show_callchain(struct hist_browser *browser,
+static int hist_browser__show_callchain_graph(struct hist_browser *browser,
                                         struct rb_root *root, int level,
                                         unsigned short row, u64 total,
+                                       u64 parent_total,
                                         print_callchain_entry_fn print,
                                         struct callchain_print_arg *arg,
                                         check_output_full_fn is_output_full)
  {
         struct rb_node *node;
         int first_row = row, offset = level * LEVEL_OFFSET_STEP;
-       u64 new_total;
         bool need_percent;
+       u64 percent_total = total;
+
+       if (callchain_param.mode == CHAIN_GRAPH_REL)
+               percent_total = parent_total;
  
         node = rb_first(root);
-       need_percent = node && rb_next(node);
+       need_percent = check_percent_display(node, parent_total);
  
         while (node) {
                 struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
@@ -878,7 +1015,7 @@ static int hist_browser__show_callchain(struct hist_browser *browser,
                         folded_sign = callchain_list__folded(chain);
  
                         row += hist_browser__show_callchain_list(browser, child,
-                                                       chain, row, total,
+                                                       chain, row, percent_total,
                                                         was_first && need_percent,
                                                         offset + extra_offset,
                                                         print, arg);
@@ -893,13 +1030,9 @@ static int hist_browser__show_callchain(struct hist_browser *browser,
                 if (folded_sign == '-') {
                         const int new_level = level + (extra_offset ? 2 : 1);
  
-                       if (callchain_param.mode == CHAIN_GRAPH_REL)
-                               new_total = child->children_hit;
-                       else
-                               new_total = total;
-
-                       row += hist_browser__show_callchain(browser, &child->rb_root,
-                                                           new_level, row, new_total,
+                       row += hist_browser__show_callchain_graph(browser, &child->rb_root,
+                                                           new_level, row, total,
+                                                           child->children_hit,
                                                             print, arg, is_output_full);
                 }
                 if (is_output_full(browser, row))
@@ -910,6 +1043,45 @@ out:
         return row - first_row;
  }
  
+static int hist_browser__show_callchain(struct hist_browser *browser,
+                                       struct hist_entry *entry, int level,
+                                       unsigned short row,
+                                       print_callchain_entry_fn print,
+                                       struct callchain_print_arg *arg,
+                                       check_output_full_fn is_output_full)
+{
+       u64 total = hists__total_period(entry->hists);
+       u64 parent_total;
+       int printed;
+
+       if (symbol_conf.cumulate_callchain)
+               parent_total = entry->stat_acc->period;
+       else
+               parent_total = entry->stat.period;
+
+       if (callchain_param.mode == CHAIN_FLAT) {
+               printed = hist_browser__show_callchain_flat(browser,
+                                               &entry->sorted_chain, row,
+                                               total, parent_total, print, arg,
+                                               is_output_full);
+       } else if (callchain_param.mode == CHAIN_FOLDED) {
+               printed = hist_browser__show_callchain_folded(browser,
+                                               &entry->sorted_chain, row,
+                                               total, parent_total, print, arg,
+                                               is_output_full);
+       } else {
+               printed = hist_browser__show_callchain_graph(browser,
+                                               &entry->sorted_chain, level, row,
+                                               total, parent_total, print, arg,
+                                               is_output_full);
+       }
+
+       if (arg->is_current_entry)
+               browser->he_selection = entry;
+
+       return printed;
+}
+
  struct hpp_arg {
         struct ui_browser *b;
         char folded_sign;
@@ -1006,7 +1178,6 @@ static int hist_browser__show_entry(struct hist_browser *browser,
                                     struct hist_entry *entry,
                                     unsigned short row)
  {
-       char s[256];
         int printed = 0;
         int width = browser->b.width;
         char folded_sign = ' ';
@@ -1031,16 +1202,18 @@ static int hist_browser__show_entry(struct hist_browser *browser,
                         .folded_sign    = folded_sign,
                         .current_entry  = current_entry,
                 };
-               struct perf_hpp hpp = {
-                       .buf            = s,
-                       .size           = sizeof(s),
-                       .ptr            = &arg,
-               };
                 int column = 0;
  
                 hist_browser__gotorc(browser, row, 0);
  
-               perf_hpp__for_each_format(fmt) {
+               hists__for_each_format(browser->hists, fmt) {
+                       char s[2048];
+                       struct perf_hpp hpp = {
+                               .buf    = s,
+                               .size   = sizeof(s),
+                               .ptr    = &arg,
+                       };
+
                         if (perf_hpp__should_skip(fmt, entry->hists) ||
                             column++ < browser->b.horiz_scroll)
                                 continue;
@@ -1065,11 +1238,18 @@ static int hist_browser__show_entry(struct hist_browser *browser,
                         }
  
                         if (fmt->color) {
-                               width -= fmt->color(fmt, &hpp, entry);
+                               int ret = fmt->color(fmt, &hpp, entry);
+                               hist_entry__snprintf_alignment(entry, &hpp, fmt, ret);
+                               /*
+                                * fmt->color() already used ui_browser to
+                                * print the non alignment bits, skip it (+ret):
+                                */
+                               ui_browser__printf(&browser->b, "%s", s + ret);
                         } else {
-                               width -= fmt->entry(fmt, &hpp, entry);
+                               hist_entry__snprintf_alignment(entry, &hpp, fmt, fmt->entry(fmt, &hpp, entry));
                                 ui_browser__printf(&browser->b, "%s", s);
                         }
+                       width -= hpp.buf - s;
                 }
  
                 /* The scroll bar isn't being used */
@@ -1084,43 +1264,246 @@ static int hist_browser__show_entry(struct hist_browser *browser,
                 --row_offset;
  
         if (folded_sign == '-' && row != browser->b.rows) {
-               u64 total = hists__total_period(entry->hists);
                 struct callchain_print_arg arg = {
                         .row_offset = row_offset,
                         .is_current_entry = current_entry,
                 };
  
-               if (callchain_param.mode == CHAIN_GRAPH_REL) {
-                       if (symbol_conf.cumulate_callchain)
-                               total = entry->stat_acc->period;
-                       else
-                               total = entry->stat.period;
-               }
-
-               if (callchain_param.mode == CHAIN_FLAT) {
-                       printed += hist_browser__show_callchain_flat(browser,
-                                       &entry->sorted_chain, row, total,
-                                       hist_browser__show_callchain_entry, &arg,
-                                       hist_browser__check_output_full);
-               } else if (callchain_param.mode == CHAIN_FOLDED) {
-                       printed += hist_browser__show_callchain_folded(browser,
-                                       &entry->sorted_chain, row, total,
+               printed += hist_browser__show_callchain(browser, entry, 1, row,
                                         hist_browser__show_callchain_entry, &arg,
                                         hist_browser__check_output_full);
+       }
+
+       return printed;
+}
+
+static int hist_browser__show_hierarchy_entry(struct hist_browser *browser,
+                                             struct hist_entry *entry,
+                                             unsigned short row,
+                                             int level)
+{
+       int printed = 0;
+       int width = browser->b.width;
+       char folded_sign = ' ';
+       bool current_entry = ui_browser__is_current_entry(&browser->b, row);
+       off_t row_offset = entry->row_offset;
+       bool first = true;
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       struct hpp_arg arg = {
+               .b              = &browser->b,
+               .current_entry  = current_entry,
+       };
+       int column = 0;
+       int hierarchy_indent = (entry->hists->nr_hpp_node - 2) * HIERARCHY_INDENT;
+
+       if (current_entry) {
+               browser->he_selection = entry;
+               browser->selection = &entry->ms;
+       }
+
+       hist_entry__init_have_children(entry);
+       folded_sign = hist_entry__folded(entry);
+       arg.folded_sign = folded_sign;
+
+       if (entry->leaf && row_offset) {
+               row_offset--;
+               goto show_callchain;
+       }
+
+       hist_browser__gotorc(browser, row, 0);
+
+       if (current_entry && browser->b.navkeypressed)
+               ui_browser__set_color(&browser->b, HE_COLORSET_SELECTED);
+       else
+               ui_browser__set_color(&browser->b, HE_COLORSET_NORMAL);
+
+       ui_browser__write_nstring(&browser->b, "", level * HIERARCHY_INDENT);
+       width -= level * HIERARCHY_INDENT;
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&entry->hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               char s[2048];
+               struct perf_hpp hpp = {
+                       .buf            = s,
+                       .size           = sizeof(s),
+                       .ptr            = &arg,
+               };
+
+               if (perf_hpp__should_skip(fmt, entry->hists) ||
+                   column++ < browser->b.horiz_scroll)
+                       continue;
+
+               if (current_entry && browser->b.navkeypressed) {
+                       ui_browser__set_color(&browser->b,
+                                             HE_COLORSET_SELECTED);
                 } else {
-                       printed += hist_browser__show_callchain(browser,
-                                       &entry->sorted_chain, 1, row, total,
-                                       hist_browser__show_callchain_entry, &arg,
-                                       hist_browser__check_output_full);
+                       ui_browser__set_color(&browser->b,
+                                             HE_COLORSET_NORMAL);
+               }
+
+               if (first) {
+                       ui_browser__printf(&browser->b, "%c", folded_sign);
+                       width--;
+                       first = false;
+               } else {
+                       ui_browser__printf(&browser->b, "  ");
+                       width -= 2;
+               }
+
+               if (fmt->color) {
+                       int ret = fmt->color(fmt, &hpp, entry);
+                       hist_entry__snprintf_alignment(entry, &hpp, fmt, ret);
+                       /*
+                        * fmt->color() already used ui_browser to
+                        * print the non alignment bits, skip it (+ret):
+                        */
+                       ui_browser__printf(&browser->b, "%s", s + ret);
+               } else {
+                       int ret = fmt->entry(fmt, &hpp, entry);
+                       hist_entry__snprintf_alignment(entry, &hpp, fmt, ret);
+                       ui_browser__printf(&browser->b, "%s", s);
+               }
+               width -= hpp.buf - s;
+       }
+
+       ui_browser__write_nstring(&browser->b, "", hierarchy_indent);
+       width -= hierarchy_indent;
+
+       if (column >= browser->b.horiz_scroll) {
+               char s[2048];
+               struct perf_hpp hpp = {
+                       .buf            = s,
+                       .size           = sizeof(s),
+                       .ptr            = &arg,
+               };
+
+               if (current_entry && browser->b.navkeypressed) {
+                       ui_browser__set_color(&browser->b,
+                                             HE_COLORSET_SELECTED);
+               } else {
+                       ui_browser__set_color(&browser->b,
+                                             HE_COLORSET_NORMAL);
                 }
  
-               if (arg.is_current_entry)
-                       browser->he_selection = entry;
+               perf_hpp_list__for_each_format(entry->hpp_list, fmt) {
+                       ui_browser__write_nstring(&browser->b, "", 2);
+                       width -= 2;
+
+                       /*
+                        * No need to call hist_entry__snprintf_alignment()
+                        * since this fmt is always the last column in the
+                        * hierarchy mode.
+                        */
+                       if (fmt->color) {
+                               width -= fmt->color(fmt, &hpp, entry);
+                       } else {
+                               int i = 0;
+
+                               width -= fmt->entry(fmt, &hpp, entry);
+                               ui_browser__printf(&browser->b, "%s", ltrim(s));
+
+                               while (isspace(s[i++]))
+                                       width++;
+                       }
+               }
+       }
+
+       /* The scroll bar isn't being used */
+       if (!browser->b.navkeypressed)
+               width += 1;
+
+       ui_browser__write_nstring(&browser->b, "", width);
+
+       ++row;
+       ++printed;
+
+show_callchain:
+       if (entry->leaf && folded_sign == '-' && row != browser->b.rows) {
+               struct callchain_print_arg carg = {
+                       .row_offset = row_offset,
+               };
+
+               printed += hist_browser__show_callchain(browser, entry,
+                                       level + 1, row,
+                                       hist_browser__show_callchain_entry, &carg,
+                                       hist_browser__check_output_full);
         }
  
         return printed;
  }
  
+static int hist_browser__show_no_entry(struct hist_browser *browser,
+                                      unsigned short row, int level)
+{
+       int width = browser->b.width;
+       bool current_entry = ui_browser__is_current_entry(&browser->b, row);
+       bool first = true;
+       int column = 0;
+       int ret;
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       int indent = browser->hists->nr_hpp_node - 2;
+
+       if (current_entry) {
+               browser->he_selection = NULL;
+               browser->selection = NULL;
+       }
+
+       hist_browser__gotorc(browser, row, 0);
+
+       if (current_entry && browser->b.navkeypressed)
+               ui_browser__set_color(&browser->b, HE_COLORSET_SELECTED);
+       else
+               ui_browser__set_color(&browser->b, HE_COLORSET_NORMAL);
+
+       ui_browser__write_nstring(&browser->b, "", level * HIERARCHY_INDENT);
+       width -= level * HIERARCHY_INDENT;
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&browser->hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               if (perf_hpp__should_skip(fmt, browser->hists) ||
+                   column++ < browser->b.horiz_scroll)
+                       continue;
+
+               ret = fmt->width(fmt, NULL, hists_to_evsel(browser->hists));
+
+               if (first) {
+                       /* for folded sign */
+                       first = false;
+                       ret++;
+               } else {
+                       /* space between columns */
+                       ret += 2;
+               }
+
+               ui_browser__write_nstring(&browser->b, "", ret);
+               width -= ret;
+       }
+
+       ui_browser__write_nstring(&browser->b, "", indent * HIERARCHY_INDENT);
+       width -= indent * HIERARCHY_INDENT;
+
+       if (column >= browser->b.horiz_scroll) {
+               char buf[32];
+
+               ret = snprintf(buf, sizeof(buf), "no entry >= %.2f%%", browser->min_pcnt);
+               ui_browser__printf(&browser->b, "  %s", buf);
+               width -= ret + 2;
+       }
+
+       /* The scroll bar isn't being used */
+       if (!browser->b.navkeypressed)
+               width += 1;
+
+       ui_browser__write_nstring(&browser->b, "", width);
+       return 1;
+}
+
  static int advance_hpp_check(struct perf_hpp *hpp, int inc)
  {
         advance_hpp(hpp, inc);
@@ -1144,7 +1527,7 @@ static int hists_browser__scnprintf_headers(struct hist_browser *browser, char *
                         return ret;
         }
  
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(browser->hists, fmt) {
                 if (perf_hpp__should_skip(fmt, hists)  || column++ < browser->b.horiz_scroll)
                         continue;
  
@@ -1160,11 +1543,96 @@ static int hists_browser__scnprintf_headers(struct hist_browser *browser, char *
         return ret;
  }
  
+static int hists_browser__scnprintf_hierarchy_headers(struct hist_browser *browser, char *buf, size_t size)
+{
+       struct hists *hists = browser->hists;
+       struct perf_hpp dummy_hpp = {
+               .buf    = buf,
+               .size   = size,
+       };
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       size_t ret = 0;
+       int column = 0;
+       int indent = hists->nr_hpp_node - 2;
+       bool first_node, first_col;
+
+       ret = scnprintf(buf, size, " ");
+       if (advance_hpp_check(&dummy_hpp, ret))
+               return ret;
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               if (column++ < browser->b.horiz_scroll)
+                       continue;
+
+               ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists));
+               if (advance_hpp_check(&dummy_hpp, ret))
+                       break;
+
+               ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, "  ");
+               if (advance_hpp_check(&dummy_hpp, ret))
+                       break;
+       }
+
+       ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, "%*s",
+                       indent * HIERARCHY_INDENT, "");
+       if (advance_hpp_check(&dummy_hpp, ret))
+               return ret;
+
+       first_node = true;
+       list_for_each_entry_continue(fmt_node, &hists->hpp_formats, list) {
+               if (!first_node) {
+                       ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, " / ");
+                       if (advance_hpp_check(&dummy_hpp, ret))
+                               break;
+               }
+               first_node = false;
+
+               first_col = true;
+               perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+                       char *start;
+
+                       if (perf_hpp__should_skip(fmt, hists))
+                               continue;
+
+                       if (!first_col) {
+                               ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, "+");
+                               if (advance_hpp_check(&dummy_hpp, ret))
+                                       break;
+                       }
+                       first_col = false;
+
+                       ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists));
+                       dummy_hpp.buf[ret] = '\0';
+                       rtrim(dummy_hpp.buf);
+
+                       start = ltrim(dummy_hpp.buf);
+                       ret = strlen(start);
+
+                       if (start != dummy_hpp.buf)
+                               memmove(dummy_hpp.buf, start, ret + 1);
+
+                       if (advance_hpp_check(&dummy_hpp, ret))
+                               break;
+               }
+       }
+
+       return ret;
+}
+
  static void hist_browser__show_headers(struct hist_browser *browser)
  {
         char headers[1024];
  
-       hists_browser__scnprintf_headers(browser, headers, sizeof(headers));
+       if (symbol_conf.report_hierarchy)
+               hists_browser__scnprintf_hierarchy_headers(browser, headers,
+                                                          sizeof(headers));
+       else
+               hists_browser__scnprintf_headers(browser, headers,
+                                                sizeof(headers));
         ui_browser__gotorc(&browser->b, 0, 0);
         ui_browser__set_color(&browser->b, HE_COLORSET_ROOT);
         ui_browser__write_nstring(&browser->b, headers, browser->b.width + 1);
@@ -1196,18 +1664,34 @@ static unsigned int hist_browser__refresh(struct ui_browser *browser)
         hb->he_selection = NULL;
         hb->selection = NULL;
  
-       for (nd = browser->top; nd; nd = rb_next(nd)) {
+       for (nd = browser->top; nd; nd = rb_hierarchy_next(nd)) {
                 struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
                 float percent;
  
-               if (h->filtered)
+               if (h->filtered) {
+                       /* let it move to sibling */
+                       h->unfolded = false;
                         continue;
+               }
  
                 percent = hist_entry__get_percent_limit(h);
                 if (percent < hb->min_pcnt)
                         continue;
  
-               row += hist_browser__show_entry(hb, h, row);
+               if (symbol_conf.report_hierarchy) {
+                       row += hist_browser__show_hierarchy_entry(hb, h, row,
+                                                                 h->depth);
+                       if (row == browser->rows)
+                               break;
+
+                       if (h->has_no_entry) {
+                               hist_browser__show_no_entry(hb, row, h->depth + 1);
+                               row++;
+                       }
+               } else {
+                       row += hist_browser__show_entry(hb, h, row);
+               }
+
                 if (row == browser->rows)
                         break;
         }
@@ -1225,7 +1709,14 @@ static struct rb_node *hists__filter_entries(struct rb_node *nd,
                 if (!h->filtered && percent >= min_pcnt)
                         return nd;
  
-               nd = rb_next(nd);
+               /*
+                * If it's filtered, its all children also were filtered.
+                * So move to sibling node.
+                */
+               if (rb_next(nd))
+                       nd = rb_next(nd);
+               else
+                       nd = rb_hierarchy_next(nd);
         }
  
         return NULL;
@@ -1241,7 +1732,7 @@ static struct rb_node *hists__filter_prev_entries(struct rb_node *nd,
                 if (!h->filtered && percent >= min_pcnt)
                         return nd;
  
-               nd = rb_prev(nd);
+               nd = rb_hierarchy_prev(nd);
         }
  
         return NULL;
@@ -1271,8 +1762,8 @@ static void ui_browser__hists_seek(struct ui_browser *browser,
                 nd = browser->top;
                 goto do_offset;
         case SEEK_END:
-               nd = hists__filter_prev_entries(rb_last(browser->entries),
-                                               hb->min_pcnt);
+               nd = rb_hierarchy_last(rb_last(browser->entries));
+               nd = hists__filter_prev_entries(nd, hb->min_pcnt);
                 first = false;
                 break;
         default:
@@ -1306,7 +1797,7 @@ do_offset:
         if (offset > 0) {
                 do {
                         h = rb_entry(nd, struct hist_entry, rb_node);
-                       if (h->unfolded) {
+                       if (h->unfolded && h->leaf) {
                                 u16 remaining = h->nr_rows - h->row_offset;
                                 if (offset > remaining) {
                                         offset -= remaining;
@@ -1318,7 +1809,8 @@ do_offset:
                                         break;
                                 }
                         }
-                       nd = hists__filter_entries(rb_next(nd), hb->min_pcnt);
+                       nd = hists__filter_entries(rb_hierarchy_next(nd),
+                                                  hb->min_pcnt);
                         if (nd == NULL)
                                 break;
                         --offset;
@@ -1327,7 +1819,7 @@ do_offset:
         } else if (offset < 0) {
                 while (1) {
                         h = rb_entry(nd, struct hist_entry, rb_node);
-                       if (h->unfolded) {
+                       if (h->unfolded && h->leaf) {
                                 if (first) {
                                         if (-offset > h->row_offset) {
                                                 offset += h->row_offset;
@@ -1351,7 +1843,7 @@ do_offset:
                                 }
                         }
  
-                       nd = hists__filter_prev_entries(rb_prev(nd),
+                       nd = hists__filter_prev_entries(rb_hierarchy_prev(nd),
                                                         hb->min_pcnt);
                         if (nd == NULL)
                                 break;
@@ -1364,7 +1856,7 @@ do_offset:
                                  * row_offset at its last entry.
                                  */
                                 h = rb_entry(nd, struct hist_entry, rb_node);
-                               if (h->unfolded)
+                               if (h->unfolded && h->leaf)
                                         h->row_offset = h->nr_rows;
                                 break;
                         }
@@ -1378,17 +1870,14 @@ do_offset:
  }
  
  static int hist_browser__fprintf_callchain(struct hist_browser *browser,
-                                          struct hist_entry *he, FILE *fp)
+                                          struct hist_entry *he, FILE *fp,
+                                          int level)
  {
-       u64 total = hists__total_period(he->hists);
         struct callchain_print_arg arg  = {
                 .fp = fp,
         };
  
-       if (symbol_conf.cumulate_callchain)
-               total = he->stat_acc->period;
-
-       hist_browser__show_callchain(browser, &he->sorted_chain, 1, 0, total,
+       hist_browser__show_callchain(browser, he, level, 0,
                                      hist_browser__fprintf_callchain_entry, &arg,
                                      hist_browser__check_dump_full);
         return arg.printed;
@@ -1414,7 +1903,7 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser,
         if (symbol_conf.use_callchain)
                 printed += fprintf(fp, "%c ", folded_sign);
  
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(browser->hists, fmt) {
                 if (perf_hpp__should_skip(fmt, he->hists))
                         continue;
  
@@ -1425,12 +1914,71 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser,
                         first = false;
  
                 ret = fmt->entry(fmt, &hpp, he);
+               ret = hist_entry__snprintf_alignment(he, &hpp, fmt, ret);
                 advance_hpp(&hpp, ret);
         }
-       printed += fprintf(fp, "%s\n", rtrim(s));
+       printed += fprintf(fp, "%s\n", s);
  
         if (folded_sign == '-')
-               printed += hist_browser__fprintf_callchain(browser, he, fp);
+               printed += hist_browser__fprintf_callchain(browser, he, fp, 1);
+
+       return printed;
+}
+
+
+static int hist_browser__fprintf_hierarchy_entry(struct hist_browser *browser,
+                                                struct hist_entry *he,
+                                                FILE *fp, int level)
+{
+       char s[8192];
+       int printed = 0;
+       char folded_sign = ' ';
+       struct perf_hpp hpp = {
+               .buf = s,
+               .size = sizeof(s),
+       };
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       bool first = true;
+       int ret;
+       int hierarchy_indent = (he->hists->nr_hpp_node - 2) * HIERARCHY_INDENT;
+
+       printed = fprintf(fp, "%*s", level * HIERARCHY_INDENT, "");
+
+       folded_sign = hist_entry__folded(he);
+       printed += fprintf(fp, "%c", folded_sign);
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&he->hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               if (!first) {
+                       ret = scnprintf(hpp.buf, hpp.size, "  ");
+                       advance_hpp(&hpp, ret);
+               } else
+                       first = false;
+
+               ret = fmt->entry(fmt, &hpp, he);
+               advance_hpp(&hpp, ret);
+       }
+
+       ret = scnprintf(hpp.buf, hpp.size, "%*s", hierarchy_indent, "");
+       advance_hpp(&hpp, ret);
+
+       perf_hpp_list__for_each_format(he->hpp_list, fmt) {
+               ret = scnprintf(hpp.buf, hpp.size, "  ");
+               advance_hpp(&hpp, ret);
+
+               ret = fmt->entry(fmt, &hpp, he);
+               advance_hpp(&hpp, ret);
+       }
+
+       printed += fprintf(fp, "%s\n", rtrim(s));
+
+       if (he->leaf && folded_sign == '-') {
+               printed += hist_browser__fprintf_callchain(browser, he, fp,
+                                                          he->depth + 1);
+       }
  
         return printed;
  }
@@ -1444,8 +1992,16 @@ static int hist_browser__fprintf(struct hist_browser *browser, FILE *fp)
         while (nd) {
                 struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
  
-               printed += hist_browser__fprintf_entry(browser, h, fp);
-               nd = hists__filter_entries(rb_next(nd), browser->min_pcnt);
+               if (symbol_conf.report_hierarchy) {
+                       printed += hist_browser__fprintf_hierarchy_entry(browser,
+                                                                        h, fp,
+                                                                        h->depth);
+               } else {
+                       printed += hist_browser__fprintf_entry(browser, h, fp);
+               }
+
+               nd = hists__filter_entries(rb_hierarchy_next(nd),
+                                          browser->min_pcnt);
         }
  
         return printed;
@@ -1580,11 +2136,18 @@ static int hists__browser_title(struct hists *hists,
         if (hists->uid_filter_str)
                 printed += snprintf(bf + printed, size - printed,
                                     ", UID: %s", hists->uid_filter_str);
-       if (thread)
-               printed += scnprintf(bf + printed, size - printed,
+       if (thread) {
+               if (sort__has_thread) {
+                       printed += scnprintf(bf + printed, size - printed,
                                     ", Thread: %s(%d)",
                                      (thread->comm_set ? thread__comm_str(thread) : ""),
                                     thread->tid);
+               } else {
+                       printed += scnprintf(bf + printed, size - printed,
+                                   ", Thread: %s",
+                                    (thread->comm_set ? thread__comm_str(thread) : ""));
+               }
+       }
         if (dso)
                 printed += scnprintf(bf + printed, size - printed,
                                     ", DSO: %s", dso->short_name);
@@ -1759,15 +2322,24 @@ do_zoom_thread(struct hist_browser *browser, struct popup_action *act)
  {
         struct thread *thread = act->thread;
  
+       if ((!sort__has_thread && !sort__has_comm) || thread == NULL)
+               return 0;
+
         if (browser->hists->thread_filter) {
                 pstack__remove(browser->pstack, &browser->hists->thread_filter);
                 perf_hpp__set_elide(HISTC_THREAD, false);
                 thread__zput(browser->hists->thread_filter);
                 ui_helpline__pop();
         } else {
-               ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s(%d) thread\"",
-                                  thread->comm_set ? thread__comm_str(thread) : "",
-                                  thread->tid);
+               if (sort__has_thread) {
+                       ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s(%d) thread\"",
+                                          thread->comm_set ? thread__comm_str(thread) : "",
+                                          thread->tid);
+               } else {
+                       ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s thread\"",
+                                          thread->comm_set ? thread__comm_str(thread) : "");
+               }
+
                 browser->hists->thread_filter = thread__get(thread);
                 perf_hpp__set_elide(HISTC_THREAD, false);
                 pstack__push(browser->pstack, &browser->hists->thread_filter);
@@ -1782,13 +2354,22 @@ static int
  add_thread_opt(struct hist_browser *browser, struct popup_action *act,
                char **optstr, struct thread *thread)
  {
-       if (thread == NULL)
+       int ret;
+
+       if ((!sort__has_thread && !sort__has_comm) || thread == NULL)
                 return 0;
  
-       if (asprintf(optstr, "Zoom %s %s(%d) thread",
-                    browser->hists->thread_filter ? "out of" : "into",
-                    thread->comm_set ? thread__comm_str(thread) : "",
-                    thread->tid) < 0)
+       if (sort__has_thread) {
+               ret = asprintf(optstr, "Zoom %s %s(%d) thread",
+                              browser->hists->thread_filter ? "out of" : "into",
+                              thread->comm_set ? thread__comm_str(thread) : "",
+                              thread->tid);
+       } else {
+               ret = asprintf(optstr, "Zoom %s %s thread",
+                              browser->hists->thread_filter ? "out of" : "into",
+                              thread->comm_set ? thread__comm_str(thread) : "");
+       }
+       if (ret < 0)
                 return 0;
  
         act->thread = thread;
@@ -1801,6 +2382,9 @@ do_zoom_dso(struct hist_browser *browser, struct popup_action *act)
  {
         struct map *map = act->ms.map;
  
+       if (!sort__has_dso || map == NULL)
+               return 0;
+
         if (browser->hists->dso_filter) {
                 pstack__remove(browser->pstack, &browser->hists->dso_filter);
                 perf_hpp__set_elide(HISTC_DSO, false);
@@ -1825,7 +2409,7 @@ static int
  add_dso_opt(struct hist_browser *browser, struct popup_action *act,
             char **optstr, struct map *map)
  {
-       if (map == NULL)
+       if (!sort__has_dso || map == NULL)
                 return 0;
  
         if (asprintf(optstr, "Zoom %s %s DSO",
@@ -1850,7 +2434,7 @@ static int
  add_map_opt(struct hist_browser *browser __maybe_unused,
             struct popup_action *act, char **optstr, struct map *map)
  {
-       if (map == NULL)
+       if (!sort__has_dso || map == NULL)
                 return 0;
  
         if (asprintf(optstr, "Browse map details") < 0)
@@ -1952,6 +2536,9 @@ add_exit_opt(struct hist_browser *browser __maybe_unused,
  static int
  do_zoom_socket(struct hist_browser *browser, struct popup_action *act)
  {
+       if (!sort__has_socket || act->socket < 0)
+               return 0;
+
         if (browser->hists->socket_filter > -1) {
                 pstack__remove(browser->pstack, &browser->hists->socket_filter);
                 browser->hists->socket_filter = -1;
@@ -1971,7 +2558,7 @@ static int
  add_socket_opt(struct hist_browser *browser, struct popup_action *act,
                char **optstr, int socket_id)
  {
-       if (socket_id < 0)
+       if (!sort__has_socket || socket_id < 0)
                 return 0;
  
         if (asprintf(optstr, "Zoom %s Processor Socket %d",
@@ -1989,17 +2576,60 @@ static void hist_browser__update_nr_entries(struct hist_browser *hb)
         u64 nr_entries = 0;
         struct rb_node *nd = rb_first(&hb->hists->entries);
  
-       if (hb->min_pcnt == 0) {
+       if (hb->min_pcnt == 0 && !symbol_conf.report_hierarchy) {
                 hb->nr_non_filtered_entries = hb->hists->nr_non_filtered_entries;
                 return;
         }
  
         while ((nd = hists__filter_entries(nd, hb->min_pcnt)) != NULL) {
                 nr_entries++;
-               nd = rb_next(nd);
+               nd = rb_hierarchy_next(nd);
         }
  
         hb->nr_non_filtered_entries = nr_entries;
+       hb->nr_hierarchy_entries = nr_entries;
+}
+
+static void hist_browser__update_percent_limit(struct hist_browser *hb,
+                                              double percent)
+{
+       struct hist_entry *he;
+       struct rb_node *nd = rb_first(&hb->hists->entries);
+       u64 total = hists__total_period(hb->hists);
+       u64 min_callchain_hits = total * (percent / 100);
+
+       hb->min_pcnt = callchain_param.min_percent = percent;
+
+       while ((nd = hists__filter_entries(nd, hb->min_pcnt)) != NULL) {
+               he = rb_entry(nd, struct hist_entry, rb_node);
+
+               if (he->has_no_entry) {
+                       he->has_no_entry = false;
+                       he->nr_rows = 0;
+               }
+
+               if (!he->leaf || !symbol_conf.use_callchain)
+                       goto next;
+
+               if (callchain_param.mode == CHAIN_GRAPH_REL) {
+                       total = he->stat.period;
+
+                       if (symbol_conf.cumulate_callchain)
+                               total = he->stat_acc->period;
+
+                       min_callchain_hits = total * (percent / 100);
+               }
+
+               callchain_param.sort(&he->sorted_chain, he->callchain,
+                                    min_callchain_hits, &callchain_param);
+
+next:
+               nd = __rb_hierarchy_next(nd, HMD_FORCE_CHILD);
+
+               /* force to re-evaluate folding state of callchains */
+               he->init_have_children = false;
+               hist_entry__set_folding(he, hb, false);
+       }
  }
  
  static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
@@ -2037,6 +2667,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
         "E             Expand all callchains\n"                         \
         "F             Toggle percentage of filtered entries\n"         \
         "H             Display column headers\n"                        \
+       "L             Change percent limit\n"                          \
         "m             Display context menu\n"                          \
         "S             Zoom into current Processor Socket\n"            \
  
@@ -2077,7 +2708,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
         memset(options, 0, sizeof(options));
         memset(actions, 0, sizeof(actions));
  
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(browser->hists, fmt) {
                 perf_hpp__reset_width(fmt, hists);
                 /*
                  * This is done just once, and activates the horizontal scrolling
@@ -2192,6 +2823,24 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
                                 top->zero = !top->zero;
                         }
                         continue;
+               case 'L':
+                       if (ui_browser__input_window("Percent Limit",
+                                       "Please enter the value you want to hide entries under that percent.",
+                                       buf, "ENTER: OK, ESC: Cancel",
+                                       delay_secs * 2) == K_ENTER) {
+                               char *end;
+                               double new_percent = strtod(buf, &end);
+
+                               if (new_percent < 0 || new_percent > 100) {
+                                       ui_browser__warning(&browser->b, delay_secs * 2,
+                                               "Invalid percent: %.2f", new_percent);
+                                       continue;
+                               }
+
+                               hist_browser__update_percent_limit(browser, new_percent);
+                               hist_browser__reset(browser);
+                       }
+                       continue;
                 case K_F1:
                 case 'h':
                 case '?':
@@ -2263,10 +2912,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
                         continue;
                 }
  
-               if (!sort__has_sym)
-                       goto add_exit_option;
-
-               if (browser->selection == NULL)
+               if (!sort__has_sym || browser->selection == NULL)
                         goto skip_annotation;
  
                 if (sort__mode == SORT_MODE__BRANCH) {
@@ -2306,11 +2952,16 @@ skip_annotation:
                                              &options[nr_options],
                                              socked_id);
                 /* perf script support */
+               if (!is_report_browser(hbt))
+                       goto skip_scripting;
+
                 if (browser->he_selection) {
-                       nr_options += add_script_opt(browser,
-                                                    &actions[nr_options],
-                                                    &options[nr_options],
-                                                    thread, NULL);
+                       if (sort__has_thread && thread) {
+                               nr_options += add_script_opt(browser,
+                                                            &actions[nr_options],
+                                                            &options[nr_options],
+                                                            thread, NULL);
+                       }
                         /*
                          * Note that browser->selection != NULL
                          * when browser->he_selection is not NULL,
@@ -2320,16 +2971,18 @@ skip_annotation:
                          *
                          * See hist_browser__show_entry.
                          */
-                       nr_options += add_script_opt(browser,
-                                                    &actions[nr_options],
-                                                    &options[nr_options],
-                                                    NULL, browser->selection->sym);
+                       if (sort__has_sym && browser->selection->sym) {
+                               nr_options += add_script_opt(browser,
+                                                            &actions[nr_options],
+                                                            &options[nr_options],
+                                                            NULL, browser->selection->sym);
+                       }
                 }
                 nr_options += add_script_opt(browser, &actions[nr_options],
                                              &options[nr_options], NULL, NULL);
                 nr_options += add_switch_opt(browser, &actions[nr_options],
                                              &options[nr_options]);
-add_exit_option:
+skip_scripting:
                 nr_options += add_exit_opt(browser, &actions[nr_options],
                                            &options[nr_options]);
  
diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c

index 0f8dcfdfb10f3856abefc7f252631e8937bacf6b..bd9bf7e343b1e310d20587d0b0f7b75c6b10a574 100644 (file)
--- a/tools/perf/ui/gtk/hists.c
+++ b/tools/perf/ui/gtk/hists.c
@@ -306,7 +306,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists,
  
         nr_cols = 0;
  
-       perf_hpp__for_each_format(fmt)
+       hists__for_each_format(hists, fmt)
                 col_types[nr_cols++] = G_TYPE_STRING;
  
         store = gtk_tree_store_newv(nr_cols, col_types);
@@ -317,7 +317,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists,
  
         col_idx = 0;
  
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(hists, fmt) {
                 if (perf_hpp__should_skip(fmt, hists))
                         continue;
  
@@ -367,7 +367,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists,
  
                 col_idx = 0;
  
-               perf_hpp__for_each_format(fmt) {
+               hists__for_each_format(hists, fmt) {
                         if (perf_hpp__should_skip(fmt, h->hists))
                                 continue;
  
@@ -396,6 +396,194 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists,
         gtk_container_add(GTK_CONTAINER(window), view);
  }
  
+static void perf_gtk__add_hierarchy_entries(struct hists *hists,
+                                           struct rb_root *root,
+                                           GtkTreeStore *store,
+                                           GtkTreeIter *parent,
+                                           struct perf_hpp *hpp,
+                                           float min_pcnt)
+{
+       int col_idx = 0;
+       struct rb_node *node;
+       struct hist_entry *he;
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       u64 total = hists__total_period(hists);
+       int size;
+
+       for (node = rb_first(root); node; node = rb_next(node)) {
+               GtkTreeIter iter;
+               float percent;
+               char *bf;
+
+               he = rb_entry(node, struct hist_entry, rb_node);
+               if (he->filtered)
+                       continue;
+
+               percent = hist_entry__get_percent_limit(he);
+               if (percent < min_pcnt)
+                       continue;
+
+               gtk_tree_store_append(store, &iter, parent);
+
+               col_idx = 0;
+
+               /* the first hpp_list_node is for overhead columns */
+               fmt_node = list_first_entry(&hists->hpp_formats,
+                                           struct perf_hpp_list_node, list);
+               perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+                       if (fmt->color)
+                               fmt->color(fmt, hpp, he);
+                       else
+                               fmt->entry(fmt, hpp, he);
+
+                       gtk_tree_store_set(store, &iter, col_idx++, hpp->buf, -1);
+               }
+
+               bf = hpp->buf;
+               size = hpp->size;
+               perf_hpp_list__for_each_format(he->hpp_list, fmt) {
+                       int ret;
+
+                       if (fmt->color)
+                               ret = fmt->color(fmt, hpp, he);
+                       else
+                               ret = fmt->entry(fmt, hpp, he);
+
+                       snprintf(hpp->buf + ret, hpp->size - ret, "  ");
+                       advance_hpp(hpp, ret + 2);
+               }
+
+               gtk_tree_store_set(store, &iter, col_idx, ltrim(rtrim(bf)), -1);
+
+               if (!he->leaf) {
+                       hpp->buf = bf;
+                       hpp->size = size;
+
+                       perf_gtk__add_hierarchy_entries(hists, &he->hroot_out,
+                                                       store, &iter, hpp,
+                                                       min_pcnt);
+
+                       if (!hist_entry__has_hierarchy_children(he, min_pcnt)) {
+                               char buf[32];
+                               GtkTreeIter child;
+
+                               snprintf(buf, sizeof(buf), "no entry >= %.2f%%",
+                                        min_pcnt);
+
+                               gtk_tree_store_append(store, &child, &iter);
+                               gtk_tree_store_set(store, &child, col_idx, buf, -1);
+                       }
+               }
+
+               if (symbol_conf.use_callchain && he->leaf) {
+                       if (callchain_param.mode == CHAIN_GRAPH_REL)
+                               total = symbol_conf.cumulate_callchain ?
+                                       he->stat_acc->period : he->stat.period;
+
+                       perf_gtk__add_callchain(&he->sorted_chain, store, &iter,
+                                               col_idx, total);
+               }
+       }
+
+}
+
+static void perf_gtk__show_hierarchy(GtkWidget *window, struct hists *hists,
+                                    float min_pcnt)
+{
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       GType col_types[MAX_COLUMNS];
+       GtkCellRenderer *renderer;
+       GtkTreeStore *store;
+       GtkWidget *view;
+       int col_idx;
+       int nr_cols = 0;
+       char s[512];
+       char buf[512];
+       bool first_node, first_col;
+       struct perf_hpp hpp = {
+               .buf            = s,
+               .size           = sizeof(s),
+       };
+
+       hists__for_each_format(hists, fmt) {
+               if (perf_hpp__is_sort_entry(fmt) ||
+                   perf_hpp__is_dynamic_entry(fmt))
+                       break;
+
+               col_types[nr_cols++] = G_TYPE_STRING;
+       }
+       col_types[nr_cols++] = G_TYPE_STRING;
+
+       store = gtk_tree_store_newv(nr_cols, col_types);
+       view = gtk_tree_view_new();
+       renderer = gtk_cell_renderer_text_new();
+
+       col_idx = 0;
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view),
+                                                           -1, fmt->name,
+                                                           renderer, "markup",
+                                                           col_idx++, NULL);
+       }
+
+       /* construct merged column header since sort keys share single column */
+       buf[0] = '\0';
+       first_node = true;
+       list_for_each_entry_continue(fmt_node, &hists->hpp_formats, list) {
+               if (!first_node)
+                       strcat(buf, " / ");
+               first_node = false;
+
+               first_col = true;
+               perf_hpp_list__for_each_format(&fmt_node->hpp ,fmt) {
+                       if (perf_hpp__should_skip(fmt, hists))
+                               continue;
+
+                       if (!first_col)
+                               strcat(buf, "+");
+                       first_col = false;
+
+                       fmt->header(fmt, &hpp, hists_to_evsel(hists));
+                       strcat(buf, ltrim(rtrim(hpp.buf)));
+               }
+       }
+
+       gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view),
+                                                   -1, buf,
+                                                   renderer, "markup",
+                                                   col_idx++, NULL);
+
+       for (col_idx = 0; col_idx < nr_cols; col_idx++) {
+               GtkTreeViewColumn *column;
+
+               column = gtk_tree_view_get_column(GTK_TREE_VIEW(view), col_idx);
+               gtk_tree_view_column_set_resizable(column, TRUE);
+
+               if (col_idx == 0) {
+                       gtk_tree_view_set_expander_column(GTK_TREE_VIEW(view),
+                                                         column);
+               }
+       }
+
+       gtk_tree_view_set_model(GTK_TREE_VIEW(view), GTK_TREE_MODEL(store));
+       g_object_unref(GTK_TREE_MODEL(store));
+
+       perf_gtk__add_hierarchy_entries(hists, &hists->entries, store,
+                                       NULL, &hpp, min_pcnt);
+
+       gtk_tree_view_set_rules_hint(GTK_TREE_VIEW(view), TRUE);
+
+       g_signal_connect(view, "row-activated",
+                        G_CALLBACK(on_row_activated), NULL);
+       gtk_container_add(GTK_CONTAINER(window), view);
+}
+
  int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist,
                                   const char *help,
                                   struct hist_browser_timer *hbt __maybe_unused,
@@ -463,7 +651,10 @@ int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist,
                                                         GTK_POLICY_AUTOMATIC,
                                                         GTK_POLICY_AUTOMATIC);
  
-               perf_gtk__show_hists(scrolled_window, hists, min_pcnt);
+               if (symbol_conf.report_hierarchy)
+                       perf_gtk__show_hierarchy(scrolled_window, hists, min_pcnt);
+               else
+                       perf_gtk__show_hists(scrolled_window, hists, min_pcnt);
  
                 tab_label = gtk_label_new(evname);
  
diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c

index bf2a66e254eac35c63a0dd44e83628f34e4dc2f9..3baeaa6e71b5a51e113b8b485df97b7c8ae003a2 100644 (file)
--- a/tools/perf/ui/hist.c
+++ b/tools/perf/ui/hist.c
@@ -5,6 +5,7 @@
  #include "../util/util.h"
  #include "../util/sort.h"
  #include "../util/evsel.h"
+#include "../util/evlist.h"
  
  /* hist period print (hpp) functions */
  
@@ -371,7 +372,20 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
         return 0;
  }
  
-#define HPP__COLOR_PRINT_FNS(_name, _fn)               \
+static bool perf_hpp__is_hpp_entry(struct perf_hpp_fmt *a)
+{
+       return a->header == hpp__header_fn;
+}
+
+static bool hpp__equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
+{
+       if (!perf_hpp__is_hpp_entry(a) || !perf_hpp__is_hpp_entry(b))
+               return false;
+
+       return a->idx == b->idx;
+}
+
+#define HPP__COLOR_PRINT_FNS(_name, _fn, _idx)         \
         {                                               \
                 .name   = _name,                        \
                 .header = hpp__header_fn,               \
@@ -381,9 +395,11 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
                 .cmp    = hpp__nop_cmp,                 \
                 .collapse = hpp__nop_cmp,               \
                 .sort   = hpp__sort_ ## _fn,            \
+               .idx    = PERF_HPP__ ## _idx,           \
+               .equal  = hpp__equal,                   \
         }
  
-#define HPP__COLOR_ACC_PRINT_FNS(_name, _fn)           \
+#define HPP__COLOR_ACC_PRINT_FNS(_name, _fn, _idx)     \
         {                                               \
                 .name   = _name,                        \
                 .header = hpp__header_fn,               \
@@ -393,9 +409,11 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
                 .cmp    = hpp__nop_cmp,                 \
                 .collapse = hpp__nop_cmp,               \
                 .sort   = hpp__sort_ ## _fn,            \
+               .idx    = PERF_HPP__ ## _idx,           \
+               .equal  = hpp__equal,                   \
         }
  
-#define HPP__PRINT_FNS(_name, _fn)                     \
+#define HPP__PRINT_FNS(_name, _fn, _idx)               \
         {                                               \
                 .name   = _name,                        \
                 .header = hpp__header_fn,               \
@@ -404,22 +422,25 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
                 .cmp    = hpp__nop_cmp,                 \
                 .collapse = hpp__nop_cmp,               \
                 .sort   = hpp__sort_ ## _fn,            \
+               .idx    = PERF_HPP__ ## _idx,           \
+               .equal  = hpp__equal,                   \
         }
  
  struct perf_hpp_fmt perf_hpp__format[] = {
-       HPP__COLOR_PRINT_FNS("Overhead", overhead),
-       HPP__COLOR_PRINT_FNS("sys", overhead_sys),
-       HPP__COLOR_PRINT_FNS("usr", overhead_us),
-       HPP__COLOR_PRINT_FNS("guest sys", overhead_guest_sys),
-       HPP__COLOR_PRINT_FNS("guest usr", overhead_guest_us),
-       HPP__COLOR_ACC_PRINT_FNS("Children", overhead_acc),
-       HPP__PRINT_FNS("Samples", samples),
-       HPP__PRINT_FNS("Period", period)
+       HPP__COLOR_PRINT_FNS("Overhead", overhead, OVERHEAD),
+       HPP__COLOR_PRINT_FNS("sys", overhead_sys, OVERHEAD_SYS),
+       HPP__COLOR_PRINT_FNS("usr", overhead_us, OVERHEAD_US),
+       HPP__COLOR_PRINT_FNS("guest sys", overhead_guest_sys, OVERHEAD_GUEST_SYS),
+       HPP__COLOR_PRINT_FNS("guest usr", overhead_guest_us, OVERHEAD_GUEST_US),
+       HPP__COLOR_ACC_PRINT_FNS("Children", overhead_acc, OVERHEAD_ACC),
+       HPP__PRINT_FNS("Samples", samples, SAMPLES),
+       HPP__PRINT_FNS("Period", period, PERIOD)
  };
  
-LIST_HEAD(perf_hpp__list);
-LIST_HEAD(perf_hpp__sort_list);
-
+struct perf_hpp_list perf_hpp_list = {
+       .fields = LIST_HEAD_INIT(perf_hpp_list.fields),
+       .sorts  = LIST_HEAD_INIT(perf_hpp_list.sorts),
+};
  
  #undef HPP__COLOR_PRINT_FNS
  #undef HPP__COLOR_ACC_PRINT_FNS
@@ -485,63 +506,60 @@ void perf_hpp__init(void)
                 hpp_dimension__add_output(PERF_HPP__PERIOD);
  }
  
-void perf_hpp__column_register(struct perf_hpp_fmt *format)
+void perf_hpp_list__column_register(struct perf_hpp_list *list,
+                                   struct perf_hpp_fmt *format)
  {
-       list_add_tail(&format->list, &perf_hpp__list);
+       list_add_tail(&format->list, &list->fields);
  }
  
-void perf_hpp__column_unregister(struct perf_hpp_fmt *format)
+void perf_hpp_list__register_sort_field(struct perf_hpp_list *list,
+                                       struct perf_hpp_fmt *format)
  {
-       list_del(&format->list);
+       list_add_tail(&format->sort_list, &list->sorts);
  }
  
-void perf_hpp__register_sort_field(struct perf_hpp_fmt *format)
-{
-       list_add_tail(&format->sort_list, &perf_hpp__sort_list);
-}
-
-void perf_hpp__column_enable(unsigned col)
-{
-       BUG_ON(col >= PERF_HPP__MAX_INDEX);
-       perf_hpp__column_register(&perf_hpp__format[col]);
-}
-
-void perf_hpp__column_disable(unsigned col)
+void perf_hpp__column_unregister(struct perf_hpp_fmt *format)
  {
-       BUG_ON(col >= PERF_HPP__MAX_INDEX);
-       perf_hpp__column_unregister(&perf_hpp__format[col]);
+       list_del(&format->list);
  }
  
  void perf_hpp__cancel_cumulate(void)
  {
+       struct perf_hpp_fmt *fmt, *acc, *ovh, *tmp;
+
         if (is_strict_order(field_order))
                 return;
  
-       perf_hpp__column_disable(PERF_HPP__OVERHEAD_ACC);
-       perf_hpp__format[PERF_HPP__OVERHEAD].name = "Overhead";
+       ovh = &perf_hpp__format[PERF_HPP__OVERHEAD];
+       acc = &perf_hpp__format[PERF_HPP__OVERHEAD_ACC];
+
+       perf_hpp_list__for_each_format_safe(&perf_hpp_list, fmt, tmp) {
+               if (acc->equal(acc, fmt)) {
+                       perf_hpp__column_unregister(fmt);
+                       continue;
+               }
+
+               if (ovh->equal(ovh, fmt))
+                       fmt->name = "Overhead";
+       }
  }
  
-void perf_hpp__setup_output_field(void)
+static bool fmt_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
+{
+       return a->equal && a->equal(a, b);
+}
+
+void perf_hpp__setup_output_field(struct perf_hpp_list *list)
  {
         struct perf_hpp_fmt *fmt;
  
         /* append sort keys to output field */
-       perf_hpp__for_each_sort_list(fmt) {
-               if (!list_empty(&fmt->list))
-                       continue;
-
-               /*
-                * sort entry fields are dynamically created,
-                * so they can share a same sort key even though
-                * the list is empty.
-                */
-               if (perf_hpp__is_sort_entry(fmt)) {
-                       struct perf_hpp_fmt *pos;
+       perf_hpp_list__for_each_sort_list(list, fmt) {
+               struct perf_hpp_fmt *pos;
  
-                       perf_hpp__for_each_format(pos) {
-                               if (perf_hpp__same_sort_entry(pos, fmt))
-                                       goto next;
-                       }
+               perf_hpp_list__for_each_format(list, pos) {
+                       if (fmt_equal(fmt, pos))
+                               goto next;
                 }
  
                 perf_hpp__column_register(fmt);
@@ -550,27 +568,17 @@ next:
         }
  }
  
-void perf_hpp__append_sort_keys(void)
+void perf_hpp__append_sort_keys(struct perf_hpp_list *list)
  {
         struct perf_hpp_fmt *fmt;
  
         /* append output fields to sort keys */
-       perf_hpp__for_each_format(fmt) {
-               if (!list_empty(&fmt->sort_list))
-                       continue;
-
-               /*
-                * sort entry fields are dynamically created,
-                * so they can share a same sort key even though
-                * the list is empty.
-                */
-               if (perf_hpp__is_sort_entry(fmt)) {
-                       struct perf_hpp_fmt *pos;
+       perf_hpp_list__for_each_format(list, fmt) {
+               struct perf_hpp_fmt *pos;
  
-                       perf_hpp__for_each_sort_list(pos) {
-                               if (perf_hpp__same_sort_entry(pos, fmt))
-                                       goto next;
-                       }
+               perf_hpp_list__for_each_sort_list(list, pos) {
+                       if (fmt_equal(fmt, pos))
+                               goto next;
                 }
  
                 perf_hpp__register_sort_field(fmt);
@@ -579,20 +587,29 @@ next:
         }
  }
  
-void perf_hpp__reset_output_field(void)
+
+static void fmt_free(struct perf_hpp_fmt *fmt)
+{
+       if (fmt->free)
+               fmt->free(fmt);
+}
+
+void perf_hpp__reset_output_field(struct perf_hpp_list *list)
  {
         struct perf_hpp_fmt *fmt, *tmp;
  
         /* reset output fields */
-       perf_hpp__for_each_format_safe(fmt, tmp) {
+       perf_hpp_list__for_each_format_safe(list, fmt, tmp) {
                 list_del_init(&fmt->list);
                 list_del_init(&fmt->sort_list);
+               fmt_free(fmt);
         }
  
         /* reset sort keys */
-       perf_hpp__for_each_sort_list_safe(fmt, tmp) {
+       perf_hpp_list__for_each_sort_list_safe(list, fmt, tmp) {
                 list_del_init(&fmt->list);
                 list_del_init(&fmt->sort_list);
+               fmt_free(fmt);
         }
  }
  
@@ -606,7 +623,7 @@ unsigned int hists__sort_list_width(struct hists *hists)
         bool first = true;
         struct perf_hpp dummy_hpp;
  
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(hists, fmt) {
                 if (perf_hpp__should_skip(fmt, hists))
                         continue;
  
@@ -624,22 +641,39 @@ unsigned int hists__sort_list_width(struct hists *hists)
         return ret;
  }
  
-void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists)
+unsigned int hists__overhead_width(struct hists *hists)
  {
-       int idx;
-
-       if (perf_hpp__is_sort_entry(fmt))
-               return perf_hpp__reset_sort_width(fmt, hists);
+       struct perf_hpp_fmt *fmt;
+       int ret = 0;
+       bool first = true;
+       struct perf_hpp dummy_hpp;
  
-       for (idx = 0; idx < PERF_HPP__MAX_INDEX; idx++) {
-               if (fmt == &perf_hpp__format[idx])
+       hists__for_each_format(hists, fmt) {
+               if (perf_hpp__is_sort_entry(fmt) || perf_hpp__is_dynamic_entry(fmt))
                         break;
+
+               if (first)
+                       first = false;
+               else
+                       ret += 2;
+
+               ret += fmt->width(fmt, &dummy_hpp, hists_to_evsel(hists));
         }
  
-       if (idx == PERF_HPP__MAX_INDEX)
+       return ret;
+}
+
+void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists)
+{
+       if (perf_hpp__is_sort_entry(fmt))
+               return perf_hpp__reset_sort_width(fmt, hists);
+
+       if (perf_hpp__is_dynamic_entry(fmt))
                 return;
  
-       switch (idx) {
+       BUG_ON(fmt->idx >= PERF_HPP__MAX_INDEX);
+
+       switch (fmt->idx) {
         case PERF_HPP__OVERHEAD:
         case PERF_HPP__OVERHEAD_SYS:
         case PERF_HPP__OVERHEAD_US:
@@ -667,7 +701,7 @@ void perf_hpp__set_user_width(const char *width_list_str)
         struct perf_hpp_fmt *fmt;
         const char *ptr = width_list_str;
  
-       perf_hpp__for_each_format(fmt) {
+       perf_hpp_list__for_each_format(&perf_hpp_list, fmt) {
                 char *p;
  
                 int len = strtol(ptr, &p, 10);
@@ -679,3 +713,71 @@ void perf_hpp__set_user_width(const char *width_list_str)
                         break;
         }
  }
+
+static int add_hierarchy_fmt(struct hists *hists, struct perf_hpp_fmt *fmt)
+{
+       struct perf_hpp_list_node *node = NULL;
+       struct perf_hpp_fmt *fmt_copy;
+       bool found = false;
+       bool skip = perf_hpp__should_skip(fmt, hists);
+
+       list_for_each_entry(node, &hists->hpp_formats, list) {
+               if (node->level == fmt->level) {
+                       found = true;
+                       break;
+               }
+       }
+
+       if (!found) {
+               node = malloc(sizeof(*node));
+               if (node == NULL)
+                       return -1;
+
+               node->skip = skip;
+               node->level = fmt->level;
+               perf_hpp_list__init(&node->hpp);
+
+               hists->nr_hpp_node++;
+               list_add_tail(&node->list, &hists->hpp_formats);
+       }
+
+       fmt_copy = perf_hpp_fmt__dup(fmt);
+       if (fmt_copy == NULL)
+               return -1;
+
+       if (!skip)
+               node->skip = false;
+
+       list_add_tail(&fmt_copy->list, &node->hpp.fields);
+       list_add_tail(&fmt_copy->sort_list, &node->hpp.sorts);
+
+       return 0;
+}
+
+int perf_hpp__setup_hists_formats(struct perf_hpp_list *list,
+                                 struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel;
+       struct perf_hpp_fmt *fmt;
+       struct hists *hists;
+       int ret;
+
+       if (!symbol_conf.report_hierarchy)
+               return 0;
+
+       evlist__for_each(evlist, evsel) {
+               hists = evsel__hists(evsel);
+
+               perf_hpp_list__for_each_sort_list(list, fmt) {
+                       if (perf_hpp__is_dynamic_entry(fmt) &&
+                           !perf_hpp__defined_dynamic_entry(fmt, hists))
+                               continue;
+
+                       ret = add_hierarchy_fmt(hists, fmt);
+                       if (ret < 0)
+                               return ret;
+               }
+       }
+
+       return 0;
+}
diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c

index 387110d50b002557d99e723605407c3db990d71a..7aff5acf3265782e03254de2d8bc1dfafb56e03a 100644 (file)
--- a/tools/perf/ui/stdio/hist.c
+++ b/tools/perf/ui/stdio/hist.c
@@ -165,8 +165,28 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct rb_root *root,
         return ret;
  }
  
+/*
+ * If have one single callchain root, don't bother printing
+ * its percentage (100 % in fractal mode and the same percentage
+ * than the hist in graph mode). This also avoid one level of column.
+ *
+ * However when percent-limit applied, it's possible that single callchain
+ * node have different (non-100% in fractal mode) percentage.
+ */
+static bool need_percent_display(struct rb_node *node, u64 parent_samples)
+{
+       struct callchain_node *cnode;
+
+       if (rb_next(node))
+               return true;
+
+       cnode = rb_entry(node, struct callchain_node, rb_node);
+       return callchain_cumul_hits(cnode) != parent_samples;
+}
+
  static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
-                                      u64 total_samples, int left_margin)
+                                      u64 total_samples, u64 parent_samples,
+                                      int left_margin)
  {
         struct callchain_node *cnode;
         struct callchain_list *chain;
@@ -177,13 +197,8 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
         int ret = 0;
         char bf[1024];
  
-       /*
-        * If have one single callchain root, don't bother printing
-        * its percentage (100 % in fractal mode and the same percentage
-        * than the hist in graph mode). This also avoid one level of column.
-        */
         node = rb_first(root);
-       if (node && !rb_next(node)) {
+       if (node && !need_percent_display(node, parent_samples)) {
                 cnode = rb_entry(node, struct callchain_node, rb_node);
                 list_for_each_entry(chain, &cnode->val, list) {
                         /*
@@ -213,9 +228,15 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
                 root = &cnode->rb_root;
         }
  
+       if (callchain_param.mode == CHAIN_GRAPH_REL)
+               total_samples = parent_samples;
+
         ret += __callchain__fprintf_graph(fp, root, total_samples,
                                           1, 1, left_margin);
-       ret += fprintf(fp, "\n");
+       if (ret) {
+               /* do not add a blank line if it printed nothing */
+               ret += fprintf(fp, "\n");
+       }
  
         return ret;
  }
@@ -323,16 +344,19 @@ static size_t hist_entry_callchain__fprintf(struct hist_entry *he,
                                             u64 total_samples, int left_margin,
                                             FILE *fp)
  {
+       u64 parent_samples = he->stat.period;
+
+       if (symbol_conf.cumulate_callchain)
+               parent_samples = he->stat_acc->period;
+
         switch (callchain_param.mode) {
         case CHAIN_GRAPH_REL:
-               return callchain__fprintf_graph(fp, &he->sorted_chain,
-                                               symbol_conf.cumulate_callchain ?
-                                               he->stat_acc->period : he->stat.period,
-                                               left_margin);
+               return callchain__fprintf_graph(fp, &he->sorted_chain, total_samples,
+                                               parent_samples, left_margin);
                 break;
         case CHAIN_GRAPH_ABS:
                 return callchain__fprintf_graph(fp, &he->sorted_chain, total_samples,
-                                               left_margin);
+                                               parent_samples, left_margin);
                 break;
         case CHAIN_FLAT:
                 return callchain__fprintf_flat(fp, &he->sorted_chain, total_samples);
@@ -349,45 +373,66 @@ static size_t hist_entry_callchain__fprintf(struct hist_entry *he,
         return 0;
  }
  
-static size_t hist_entry__callchain_fprintf(struct hist_entry *he,
-                                           struct hists *hists,
-                                           FILE *fp)
+static int hist_entry__snprintf(struct hist_entry *he, struct perf_hpp *hpp)
  {
-       int left_margin = 0;
-       u64 total_period = hists->stats.total_period;
+       const char *sep = symbol_conf.field_sep;
+       struct perf_hpp_fmt *fmt;
+       char *start = hpp->buf;
+       int ret;
+       bool first = true;
  
-       if (field_order == NULL && (sort_order == NULL ||
-                                   !prefixcmp(sort_order, "comm"))) {
-               struct perf_hpp_fmt *fmt;
+       if (symbol_conf.exclude_other && !he->parent)
+               return 0;
  
-               perf_hpp__for_each_format(fmt) {
-                       if (!perf_hpp__is_sort_entry(fmt))
-                               continue;
+       hists__for_each_format(he->hists, fmt) {
+               if (perf_hpp__should_skip(fmt, he->hists))
+                       continue;
  
-                       /* must be 'comm' sort entry */
-                       left_margin = fmt->width(fmt, NULL, hists_to_evsel(hists));
-                       left_margin -= thread__comm_len(he->thread);
-                       break;
-               }
+               /*
+                * If there's no field_sep, we still need
+                * to display initial '  '.
+                */
+               if (!sep || !first) {
+                       ret = scnprintf(hpp->buf, hpp->size, "%s", sep ?: "  ");
+                       advance_hpp(hpp, ret);
+               } else
+                       first = false;
+
+               if (perf_hpp__use_color() && fmt->color)
+                       ret = fmt->color(fmt, hpp, he);
+               else
+                       ret = fmt->entry(fmt, hpp, he);
+
+               ret = hist_entry__snprintf_alignment(he, hpp, fmt, ret);
+               advance_hpp(hpp, ret);
         }
-       return hist_entry_callchain__fprintf(he, total_period, left_margin, fp);
+
+       return hpp->buf - start;
  }
  
-static int hist_entry__snprintf(struct hist_entry *he, struct perf_hpp *hpp)
+static int hist_entry__hierarchy_fprintf(struct hist_entry *he,
+                                        struct perf_hpp *hpp,
+                                        struct hists *hists,
+                                        FILE *fp)
  {
         const char *sep = symbol_conf.field_sep;
         struct perf_hpp_fmt *fmt;
-       char *start = hpp->buf;
-       int ret;
+       struct perf_hpp_list_node *fmt_node;
+       char *buf = hpp->buf;
+       size_t size = hpp->size;
+       int ret, printed = 0;
         bool first = true;
  
         if (symbol_conf.exclude_other && !he->parent)
                 return 0;
  
-       perf_hpp__for_each_format(fmt) {
-               if (perf_hpp__should_skip(fmt, he->hists))
-                       continue;
+       ret = scnprintf(hpp->buf, hpp->size, "%*s", he->depth * HIERARCHY_INDENT, "");
+       advance_hpp(hpp, ret);
  
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
                 /*
                  * If there's no field_sep, we still need
                  * to display initial '  '.
@@ -403,10 +448,47 @@ static int hist_entry__snprintf(struct hist_entry *he, struct perf_hpp *hpp)
                 else
                         ret = fmt->entry(fmt, hpp, he);
  
+               ret = hist_entry__snprintf_alignment(he, hpp, fmt, ret);
                 advance_hpp(hpp, ret);
         }
  
-       return hpp->buf - start;
+       if (!sep)
+               ret = scnprintf(hpp->buf, hpp->size, "%*s",
+                               (hists->nr_hpp_node - 2) * HIERARCHY_INDENT, "");
+       advance_hpp(hpp, ret);
+
+       printed += fprintf(fp, "%s", buf);
+
+       perf_hpp_list__for_each_format(he->hpp_list, fmt) {
+               hpp->buf  = buf;
+               hpp->size = size;
+
+               /*
+                * No need to call hist_entry__snprintf_alignment() since this
+                * fmt is always the last column in the hierarchy mode.
+                */
+               if (perf_hpp__use_color() && fmt->color)
+                       fmt->color(fmt, hpp, he);
+               else
+                       fmt->entry(fmt, hpp, he);
+
+               /*
+                * dynamic entries are right-aligned but we want left-aligned
+                * in the hierarchy mode
+                */
+               printed += fprintf(fp, "%s%s", sep ?: "  ", ltrim(buf));
+       }
+       printed += putc('\n', fp);
+
+       if (symbol_conf.use_callchain && he->leaf) {
+               u64 total = hists__total_period(hists);
+
+               printed += hist_entry_callchain__fprintf(he, total, 0, fp);
+               goto out;
+       }
+
+out:
+       return printed;
  }
  
  static int hist_entry__fprintf(struct hist_entry *he, size_t size,
@@ -418,24 +500,134 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size,
                 .buf            = bf,
                 .size           = size,
         };
+       u64 total_period = hists->stats.total_period;
  
         if (size == 0 || size > bfsz)
                 size = hpp.size = bfsz;
  
+       if (symbol_conf.report_hierarchy)
+               return hist_entry__hierarchy_fprintf(he, &hpp, hists, fp);
+
         hist_entry__snprintf(he, &hpp);
  
         ret = fprintf(fp, "%s\n", bf);
  
         if (symbol_conf.use_callchain)
-               ret += hist_entry__callchain_fprintf(he, hists, fp);
+               ret += hist_entry_callchain__fprintf(he, total_period, 0, fp);
  
         return ret;
  }
  
+static int print_hierarchy_indent(const char *sep, int indent,
+                                 const char *line, FILE *fp)
+{
+       if (sep != NULL || indent < 2)
+               return 0;
+
+       return fprintf(fp, "%-.*s", (indent - 2) * HIERARCHY_INDENT, line);
+}
+
+static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp,
+                                 const char *sep, FILE *fp)
+{
+       bool first_node, first_col;
+       int indent;
+       int depth;
+       unsigned width = 0;
+       unsigned header_width = 0;
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+
+       indent = hists->nr_hpp_node;
+
+       /* preserve max indent depth for column headers */
+       print_hierarchy_indent(sep, indent, spaces, fp);
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               fmt->header(fmt, hpp, hists_to_evsel(hists));
+               fprintf(fp, "%s%s", hpp->buf, sep ?: "  ");
+       }
+
+       /* combine sort headers with ' / ' */
+       first_node = true;
+       list_for_each_entry_continue(fmt_node, &hists->hpp_formats, list) {
+               if (!first_node)
+                       header_width += fprintf(fp, " / ");
+               first_node = false;
+
+               first_col = true;
+               perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+                       if (perf_hpp__should_skip(fmt, hists))
+                               continue;
+
+                       if (!first_col)
+                               header_width += fprintf(fp, "+");
+                       first_col = false;
+
+                       fmt->header(fmt, hpp, hists_to_evsel(hists));
+                       rtrim(hpp->buf);
+
+                       header_width += fprintf(fp, "%s", ltrim(hpp->buf));
+               }
+       }
+
+       fprintf(fp, "\n# ");
+
+       /* preserve max indent depth for initial dots */
+       print_hierarchy_indent(sep, indent, dots, fp);
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+
+       first_col = true;
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               if (!first_col)
+                       fprintf(fp, "%s", sep ?: "..");
+               first_col = false;
+
+               width = fmt->width(fmt, hpp, hists_to_evsel(hists));
+               fprintf(fp, "%.*s", width, dots);
+       }
+
+       depth = 0;
+       list_for_each_entry_continue(fmt_node, &hists->hpp_formats, list) {
+               first_col = true;
+               width = depth * HIERARCHY_INDENT;
+
+               perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+                       if (perf_hpp__should_skip(fmt, hists))
+                               continue;
+
+                       if (!first_col)
+                               width++;  /* for '+' sign between column header */
+                       first_col = false;
+
+                       width += fmt->width(fmt, hpp, hists_to_evsel(hists));
+               }
+
+               if (width > header_width)
+                       header_width = width;
+
+               depth++;
+       }
+
+       fprintf(fp, "%s%-.*s", sep ?: "  ", header_width, dots);
+
+       fprintf(fp, "\n#\n");
+
+       return 2;
+}
+
  size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows,
                       int max_cols, float min_pcnt, FILE *fp)
  {
         struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
         struct rb_node *nd;
         size_t ret = 0;
         unsigned int width;
@@ -449,10 +641,11 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows,
         bool first = true;
         size_t linesz;
         char *line = NULL;
+       unsigned indent;
  
         init_rem_hits();
  
-       perf_hpp__for_each_format(fmt)
+       hists__for_each_format(hists, fmt)
                 perf_hpp__reset_width(fmt, hists);
  
         if (symbol_conf.col_width_list_str)
@@ -463,7 +656,16 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows,
  
         fprintf(fp, "# ");
  
-       perf_hpp__for_each_format(fmt) {
+       if (symbol_conf.report_hierarchy) {
+               list_for_each_entry(fmt_node, &hists->hpp_formats, list) {
+                       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt)
+                               perf_hpp__reset_width(fmt, hists);
+               }
+               nr_rows += print_hierarchy_header(hists, &dummy_hpp, sep, fp);
+               goto print_entries;
+       }
+
+       hists__for_each_format(hists, fmt) {
                 if (perf_hpp__should_skip(fmt, hists))
                         continue;
  
@@ -487,7 +689,7 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows,
  
         fprintf(fp, "# ");
  
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(hists, fmt) {
                 unsigned int i;
  
                 if (perf_hpp__should_skip(fmt, hists))
@@ -520,7 +722,9 @@ print_entries:
                 goto out;
         }
  
-       for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
+       indent = hists__overhead_width(hists) + 4;
+
+       for (nd = rb_first(&hists->entries); nd; nd = __rb_hierarchy_next(nd, HMD_FORCE_CHILD)) {
                 struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
                 float percent;
  
@@ -536,6 +740,20 @@ print_entries:
                 if (max_rows && ++nr_rows >= max_rows)
                         break;
  
+               /*
+                * If all children are filtered out or percent-limited,
+                * display "no entry >= x.xx%" message.
+                */
+               if (!h->leaf && !hist_entry__has_hierarchy_children(h, min_pcnt)) {
+                       int depth = hists->nr_hpp_node + h->depth + 1;
+
+                       print_hierarchy_indent(sep, depth, spaces, fp);
+                       fprintf(fp, "%*sno entry >= %.2f%%\n", indent, "", min_pcnt);
+
+                       if (max_rows && ++nr_rows >= max_rows)
+                               break;
+               }
+
                 if (h->ms.map == NULL && verbose > 1) {
                         __map_groups__fprintf_maps(h->thread->mg,
                                                    MAP__FUNCTION, fp);
diff --git a/tools/perf/util/Build b/tools/perf/util/Build

index 5eec53a3f4ac7dbedb54776f746705acfa4fa2b4..eea25e2424e99172715c681a05ffc68b3c58e78a 100644 (file)
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -82,6 +82,7 @@ libperf-y += parse-branch-options.o
  libperf-y += parse-regs-options.o
  libperf-y += term.o
  libperf-y += help-unknown-cmd.o
+libperf-y += mem-events.o
  
  libperf-$(CONFIG_LIBBPF) += bpf-loader.o
  libperf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
@@ -105,8 +106,17 @@ libperf-y += scripting-engines/
  
  libperf-$(CONFIG_ZLIB) += zlib.o
  libperf-$(CONFIG_LZMA) += lzma.o
+libperf-y += demangle-java.o
+
+ifdef CONFIG_JITDUMP
+libperf-$(CONFIG_LIBELF) += jitdump.o
+libperf-$(CONFIG_LIBELF) += genelf.o
+libperf-$(CONFIG_LIBELF) += genelf_debug.o
+endif
  
  CFLAGS_config.o   += -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))"
+# avoid compiler warnings in 32-bit mode
+CFLAGS_genelf_debug.o  += -Wno-packed
  
  $(OUTPUT)util/parse-events-flex.c: util/parse-events.l $(OUTPUT)util/parse-events-bison.c
         $(call rule_mkdir)
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c

index 360fda01f3b0d17369254d03fe16daf24f30c698..ec164fe70718df1480b02733d8701c7ab2b74297 100644 (file)
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -478,10 +478,11 @@ void auxtrace_heap__pop(struct auxtrace_heap *heap)
                          heap_array[last].ordinal);
  }
  
-size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr)
+size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr,
+                                      struct perf_evlist *evlist)
  {
         if (itr)
-               return itr->info_priv_size(itr);
+               return itr->info_priv_size(itr, evlist);
         return 0;
  }
  
@@ -852,7 +853,7 @@ int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr,
         int err;
  
         pr_debug2("Synthesizing auxtrace information\n");
-       priv_size = auxtrace_record__info_priv_size(itr);
+       priv_size = auxtrace_record__info_priv_size(itr, session->evlist);
         ev = zalloc(sizeof(struct auxtrace_info_event) + priv_size);
         if (!ev)
                 return -ENOMEM;
diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h

index b86f90db1352a6c8635e3ea5d02aa3c21bccc323..e5a8e2d4f2af4e9b717be7ce27c587f569983027 100644 (file)
--- a/tools/perf/util/auxtrace.h
+++ b/tools/perf/util/auxtrace.h
@@ -293,7 +293,8 @@ struct auxtrace_record {
         int (*recording_options)(struct auxtrace_record *itr,
                                  struct perf_evlist *evlist,
                                  struct record_opts *opts);
-       size_t (*info_priv_size)(struct auxtrace_record *itr);
+       size_t (*info_priv_size)(struct auxtrace_record *itr,
+                                struct perf_evlist *evlist);
         int (*info_fill)(struct auxtrace_record *itr,
                          struct perf_session *session,
                          struct auxtrace_info_event *auxtrace_info,
@@ -429,7 +430,8 @@ int auxtrace_parse_snapshot_options(struct auxtrace_record *itr,
  int auxtrace_record__options(struct auxtrace_record *itr,
                              struct perf_evlist *evlist,
                              struct record_opts *opts);
-size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr);
+size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr,
+                                      struct perf_evlist *evlist);
  int auxtrace_record__info_fill(struct auxtrace_record *itr,
                                struct perf_session *session,
                                struct auxtrace_info_event *auxtrace_info,
diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c

index 540a7efa657ea8668eeddf1b476bbcb3c0490bbc..0967ce601931685ed294827e8aef7c30c47736c6 100644 (file)
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -7,6 +7,7 @@
  
  #include <linux/bpf.h>
  #include <bpf/libbpf.h>
+#include <bpf/bpf.h>
  #include <linux/err.h>
  #include <linux/string.h>
  #include "perf.h"
@@ -16,6 +17,7 @@
  #include "llvm-utils.h"
  #include "probe-event.h"
  #include "probe-finder.h" // for MAX_PROBES
+#include "parse-events.h"
  #include "llvm-utils.h"
  
  #define DEFINE_PRINT_FN(name, level) \
@@ -108,8 +110,8 @@ void bpf__clear(void)
  }
  
  static void
-bpf_prog_priv__clear(struct bpf_program *prog __maybe_unused,
-                    void *_priv)
+clear_prog_priv(struct bpf_program *prog __maybe_unused,
+               void *_priv)
  {
         struct bpf_prog_priv *priv = _priv;
  
@@ -337,7 +339,7 @@ config_bpf_program(struct bpf_program *prog)
         }
         pr_debug("bpf: config '%s' is ok\n", config_str);
  
-       err = bpf_program__set_private(prog, priv, bpf_prog_priv__clear);
+       err = bpf_program__set_private(prog, priv, clear_prog_priv);
         if (err) {
                 pr_debug("Failed to set priv for program '%s'\n", config_str);
                 goto errout;
@@ -739,6 +741,682 @@ int bpf__foreach_tev(struct bpf_object *obj,
         return 0;
  }
  
+enum bpf_map_op_type {
+       BPF_MAP_OP_SET_VALUE,
+       BPF_MAP_OP_SET_EVSEL,
+};
+
+enum bpf_map_key_type {
+       BPF_MAP_KEY_ALL,
+       BPF_MAP_KEY_RANGES,
+};
+
+struct bpf_map_op {
+       struct list_head list;
+       enum bpf_map_op_type op_type;
+       enum bpf_map_key_type key_type;
+       union {
+               struct parse_events_array array;
+       } k;
+       union {
+               u64 value;
+               struct perf_evsel *evsel;
+       } v;
+};
+
+struct bpf_map_priv {
+       struct list_head ops_list;
+};
+
+static void
+bpf_map_op__delete(struct bpf_map_op *op)
+{
+       if (!list_empty(&op->list))
+               list_del(&op->list);
+       if (op->key_type == BPF_MAP_KEY_RANGES)
+               parse_events__clear_array(&op->k.array);
+       free(op);
+}
+
+static void
+bpf_map_priv__purge(struct bpf_map_priv *priv)
+{
+       struct bpf_map_op *pos, *n;
+
+       list_for_each_entry_safe(pos, n, &priv->ops_list, list) {
+               list_del_init(&pos->list);
+               bpf_map_op__delete(pos);
+       }
+}
+
+static void
+bpf_map_priv__clear(struct bpf_map *map __maybe_unused,
+                   void *_priv)
+{
+       struct bpf_map_priv *priv = _priv;
+
+       bpf_map_priv__purge(priv);
+       free(priv);
+}
+
+static int
+bpf_map_op_setkey(struct bpf_map_op *op, struct parse_events_term *term)
+{
+       op->key_type = BPF_MAP_KEY_ALL;
+       if (!term)
+               return 0;
+
+       if (term->array.nr_ranges) {
+               size_t memsz = term->array.nr_ranges *
+                               sizeof(op->k.array.ranges[0]);
+
+               op->k.array.ranges = memdup(term->array.ranges, memsz);
+               if (!op->k.array.ranges) {
+                       pr_debug("No enough memory to alloc indices for map\n");
+                       return -ENOMEM;
+               }
+               op->key_type = BPF_MAP_KEY_RANGES;
+               op->k.array.nr_ranges = term->array.nr_ranges;
+       }
+       return 0;
+}
+
+static struct bpf_map_op *
+bpf_map_op__new(struct parse_events_term *term)
+{
+       struct bpf_map_op *op;
+       int err;
+
+       op = zalloc(sizeof(*op));
+       if (!op) {
+               pr_debug("Failed to alloc bpf_map_op\n");
+               return ERR_PTR(-ENOMEM);
+       }
+       INIT_LIST_HEAD(&op->list);
+
+       err = bpf_map_op_setkey(op, term);
+       if (err) {
+               free(op);
+               return ERR_PTR(err);
+       }
+       return op;
+}
+
+static int
+bpf_map__add_op(struct bpf_map *map, struct bpf_map_op *op)
+{
+       struct bpf_map_priv *priv;
+       const char *map_name;
+       int err;
+
+       map_name = bpf_map__get_name(map);
+       err = bpf_map__get_private(map, (void **)&priv);
+       if (err) {
+               pr_debug("Failed to get private from map %s\n", map_name);
+               return err;
+       }
+
+       if (!priv) {
+               priv = zalloc(sizeof(*priv));
+               if (!priv) {
+                       pr_debug("No enough memory to alloc map private\n");
+                       return -ENOMEM;
+               }
+               INIT_LIST_HEAD(&priv->ops_list);
+
+               if (bpf_map__set_private(map, priv, bpf_map_priv__clear)) {
+                       free(priv);
+                       return -BPF_LOADER_ERRNO__INTERNAL;
+               }
+       }
+
+       list_add_tail(&op->list, &priv->ops_list);
+       return 0;
+}
+
+static struct bpf_map_op *
+bpf_map__add_newop(struct bpf_map *map, struct parse_events_term *term)
+{
+       struct bpf_map_op *op;
+       int err;
+
+       op = bpf_map_op__new(term);
+       if (IS_ERR(op))
+               return op;
+
+       err = bpf_map__add_op(map, op);
+       if (err) {
+               bpf_map_op__delete(op);
+               return ERR_PTR(err);
+       }
+       return op;
+}
+
+static int
+__bpf_map__config_value(struct bpf_map *map,
+                       struct parse_events_term *term)
+{
+       struct bpf_map_def def;
+       struct bpf_map_op *op;
+       const char *map_name;
+       int err;
+
+       map_name = bpf_map__get_name(map);
+
+       err = bpf_map__get_def(map, &def);
+       if (err) {
+               pr_debug("Unable to get map definition from '%s'\n",
+                        map_name);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+
+       if (def.type != BPF_MAP_TYPE_ARRAY) {
+               pr_debug("Map %s type is not BPF_MAP_TYPE_ARRAY\n",
+                        map_name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE;
+       }
+       if (def.key_size < sizeof(unsigned int)) {
+               pr_debug("Map %s has incorrect key size\n", map_name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_KEYSIZE;
+       }
+       switch (def.value_size) {
+       case 1:
+       case 2:
+       case 4:
+       case 8:
+               break;
+       default:
+               pr_debug("Map %s has incorrect value size\n", map_name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUESIZE;
+       }
+
+       op = bpf_map__add_newop(map, term);
+       if (IS_ERR(op))
+               return PTR_ERR(op);
+       op->op_type = BPF_MAP_OP_SET_VALUE;
+       op->v.value = term->val.num;
+       return 0;
+}
+
+static int
+bpf_map__config_value(struct bpf_map *map,
+                     struct parse_events_term *term,
+                     struct perf_evlist *evlist __maybe_unused)
+{
+       if (!term->err_val) {
+               pr_debug("Config value not set\n");
+               return -BPF_LOADER_ERRNO__OBJCONF_CONF;
+       }
+
+       if (term->type_val != PARSE_EVENTS__TERM_TYPE_NUM) {
+               pr_debug("ERROR: wrong value type for 'value'\n");
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE;
+       }
+
+       return __bpf_map__config_value(map, term);
+}
+
+static int
+__bpf_map__config_event(struct bpf_map *map,
+                       struct parse_events_term *term,
+                       struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel;
+       struct bpf_map_def def;
+       struct bpf_map_op *op;
+       const char *map_name;
+       int err;
+
+       map_name = bpf_map__get_name(map);
+       evsel = perf_evlist__find_evsel_by_str(evlist, term->val.str);
+       if (!evsel) {
+               pr_debug("Event (for '%s') '%s' doesn't exist\n",
+                        map_name, term->val.str);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_NOEVT;
+       }
+
+       err = bpf_map__get_def(map, &def);
+       if (err) {
+               pr_debug("Unable to get map definition from '%s'\n",
+                        map_name);
+               return err;
+       }
+
+       /*
+        * No need to check key_size and value_size:
+        * kernel has already checked them.
+        */
+       if (def.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
+               pr_debug("Map %s type is not BPF_MAP_TYPE_PERF_EVENT_ARRAY\n",
+                        map_name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE;
+       }
+
+       op = bpf_map__add_newop(map, term);
+       if (IS_ERR(op))
+               return PTR_ERR(op);
+       op->op_type = BPF_MAP_OP_SET_EVSEL;
+       op->v.evsel = evsel;
+       return 0;
+}
+
+static int
+bpf_map__config_event(struct bpf_map *map,
+                     struct parse_events_term *term,
+                     struct perf_evlist *evlist)
+{
+       if (!term->err_val) {
+               pr_debug("Config value not set\n");
+               return -BPF_LOADER_ERRNO__OBJCONF_CONF;
+       }
+
+       if (term->type_val != PARSE_EVENTS__TERM_TYPE_STR) {
+               pr_debug("ERROR: wrong value type for 'event'\n");
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE;
+       }
+
+       return __bpf_map__config_event(map, term, evlist);
+}
+
+struct bpf_obj_config__map_func {
+       const char *config_opt;
+       int (*config_func)(struct bpf_map *, struct parse_events_term *,
+                          struct perf_evlist *);
+};
+
+struct bpf_obj_config__map_func bpf_obj_config__map_funcs[] = {
+       {"value", bpf_map__config_value},
+       {"event", bpf_map__config_event},
+};
+
+static int
+config_map_indices_range_check(struct parse_events_term *term,
+                              struct bpf_map *map,
+                              const char *map_name)
+{
+       struct parse_events_array *array = &term->array;
+       struct bpf_map_def def;
+       unsigned int i;
+       int err;
+
+       if (!array->nr_ranges)
+               return 0;
+       if (!array->ranges) {
+               pr_debug("ERROR: map %s: array->nr_ranges is %d but range array is NULL\n",
+                        map_name, (int)array->nr_ranges);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+
+       err = bpf_map__get_def(map, &def);
+       if (err) {
+               pr_debug("ERROR: Unable to get map definition from '%s'\n",
+                        map_name);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+
+       for (i = 0; i < array->nr_ranges; i++) {
+               unsigned int start = array->ranges[i].start;
+               size_t length = array->ranges[i].length;
+               unsigned int idx = start + length - 1;
+
+               if (idx >= def.max_entries) {
+                       pr_debug("ERROR: index %d too large\n", idx);
+                       return -BPF_LOADER_ERRNO__OBJCONF_MAP_IDX2BIG;
+               }
+       }
+       return 0;
+}
+
+static int
+bpf__obj_config_map(struct bpf_object *obj,
+                   struct parse_events_term *term,
+                   struct perf_evlist *evlist,
+                   int *key_scan_pos)
+{
+       /* key is "map:<mapname>.<config opt>" */
+       char *map_name = strdup(term->config + sizeof("map:") - 1);
+       struct bpf_map *map;
+       int err = -BPF_LOADER_ERRNO__OBJCONF_OPT;
+       char *map_opt;
+       size_t i;
+
+       if (!map_name)
+               return -ENOMEM;
+
+       map_opt = strchr(map_name, '.');
+       if (!map_opt) {
+               pr_debug("ERROR: Invalid map config: %s\n", map_name);
+               goto out;
+       }
+
+       *map_opt++ = '\0';
+       if (*map_opt == '\0') {
+               pr_debug("ERROR: Invalid map option: %s\n", term->config);
+               goto out;
+       }
+
+       map = bpf_object__get_map_by_name(obj, map_name);
+       if (!map) {
+               pr_debug("ERROR: Map %s doesn't exist\n", map_name);
+               err = -BPF_LOADER_ERRNO__OBJCONF_MAP_NOTEXIST;
+               goto out;
+       }
+
+       *key_scan_pos += strlen(map_opt);
+       err = config_map_indices_range_check(term, map, map_name);
+       if (err)
+               goto out;
+       *key_scan_pos -= strlen(map_opt);
+
+       for (i = 0; i < ARRAY_SIZE(bpf_obj_config__map_funcs); i++) {
+               struct bpf_obj_config__map_func *func =
+                               &bpf_obj_config__map_funcs[i];
+
+               if (strcmp(map_opt, func->config_opt) == 0) {
+                       err = func->config_func(map, term, evlist);
+                       goto out;
+               }
+       }
+
+       pr_debug("ERROR: Invalid map config option '%s'\n", map_opt);
+       err = -BPF_LOADER_ERRNO__OBJCONF_MAP_OPT;
+out:
+       free(map_name);
+       if (!err)
+               key_scan_pos += strlen(map_opt);
+       return err;
+}
+
+int bpf__config_obj(struct bpf_object *obj,
+                   struct parse_events_term *term,
+                   struct perf_evlist *evlist,
+                   int *error_pos)
+{
+       int key_scan_pos = 0;
+       int err;
+
+       if (!obj || !term || !term->config)
+               return -EINVAL;
+
+       if (!prefixcmp(term->config, "map:")) {
+               key_scan_pos = sizeof("map:") - 1;
+               err = bpf__obj_config_map(obj, term, evlist, &key_scan_pos);
+               goto out;
+       }
+       err = -BPF_LOADER_ERRNO__OBJCONF_OPT;
+out:
+       if (error_pos)
+               *error_pos = key_scan_pos;
+       return err;
+
+}
+
+typedef int (*map_config_func_t)(const char *name, int map_fd,
+                                struct bpf_map_def *pdef,
+                                struct bpf_map_op *op,
+                                void *pkey, void *arg);
+
+static int
+foreach_key_array_all(map_config_func_t func,
+                     void *arg, const char *name,
+                     int map_fd, struct bpf_map_def *pdef,
+                     struct bpf_map_op *op)
+{
+       unsigned int i;
+       int err;
+
+       for (i = 0; i < pdef->max_entries; i++) {
+               err = func(name, map_fd, pdef, op, &i, arg);
+               if (err) {
+                       pr_debug("ERROR: failed to insert value to %s[%u]\n",
+                                name, i);
+                       return err;
+               }
+       }
+       return 0;
+}
+
+static int
+foreach_key_array_ranges(map_config_func_t func, void *arg,
+                        const char *name, int map_fd,
+                        struct bpf_map_def *pdef,
+                        struct bpf_map_op *op)
+{
+       unsigned int i, j;
+       int err;
+
+       for (i = 0; i < op->k.array.nr_ranges; i++) {
+               unsigned int start = op->k.array.ranges[i].start;
+               size_t length = op->k.array.ranges[i].length;
+
+               for (j = 0; j < length; j++) {
+                       unsigned int idx = start + j;
+
+                       err = func(name, map_fd, pdef, op, &idx, arg);
+                       if (err) {
+                               pr_debug("ERROR: failed to insert value to %s[%u]\n",
+                                        name, idx);
+                               return err;
+                       }
+               }
+       }
+       return 0;
+}
+
+static int
+bpf_map_config_foreach_key(struct bpf_map *map,
+                          map_config_func_t func,
+                          void *arg)
+{
+       int err, map_fd;
+       const char *name;
+       struct bpf_map_op *op;
+       struct bpf_map_def def;
+       struct bpf_map_priv *priv;
+
+       name = bpf_map__get_name(map);
+
+       err = bpf_map__get_private(map, (void **)&priv);
+       if (err) {
+               pr_debug("ERROR: failed to get private from map %s\n", name);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+       if (!priv || list_empty(&priv->ops_list)) {
+               pr_debug("INFO: nothing to config for map %s\n", name);
+               return 0;
+       }
+
+       err = bpf_map__get_def(map, &def);
+       if (err) {
+               pr_debug("ERROR: failed to get definition from map %s\n", name);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+       map_fd = bpf_map__get_fd(map);
+       if (map_fd < 0) {
+               pr_debug("ERROR: failed to get fd from map %s\n", name);
+               return map_fd;
+       }
+
+       list_for_each_entry(op, &priv->ops_list, list) {
+               switch (def.type) {
+               case BPF_MAP_TYPE_ARRAY:
+               case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+                       switch (op->key_type) {
+                       case BPF_MAP_KEY_ALL:
+                               err = foreach_key_array_all(func, arg, name,
+                                                           map_fd, &def, op);
+                               break;
+                       case BPF_MAP_KEY_RANGES:
+                               err = foreach_key_array_ranges(func, arg, name,
+                                                              map_fd, &def,
+                                                              op);
+                               break;
+                       default:
+                               pr_debug("ERROR: keytype for map '%s' invalid\n",
+                                        name);
+                               return -BPF_LOADER_ERRNO__INTERNAL;
+                       }
+                       if (err)
+                               return err;
+                       break;
+               default:
+                       pr_debug("ERROR: type of '%s' incorrect\n", name);
+                       return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE;
+               }
+       }
+
+       return 0;
+}
+
+static int
+apply_config_value_for_key(int map_fd, void *pkey,
+                          size_t val_size, u64 val)
+{
+       int err = 0;
+
+       switch (val_size) {
+       case 1: {
+               u8 _val = (u8)(val);
+               err = bpf_map_update_elem(map_fd, pkey, &_val, BPF_ANY);
+               break;
+       }
+       case 2: {
+               u16 _val = (u16)(val);
+               err = bpf_map_update_elem(map_fd, pkey, &_val, BPF_ANY);
+               break;
+       }
+       case 4: {
+               u32 _val = (u32)(val);
+               err = bpf_map_update_elem(map_fd, pkey, &_val, BPF_ANY);
+               break;
+       }
+       case 8: {
+               err = bpf_map_update_elem(map_fd, pkey, &val, BPF_ANY);
+               break;
+       }
+       default:
+               pr_debug("ERROR: invalid value size\n");
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUESIZE;
+       }
+       if (err && errno)
+               err = -errno;
+       return err;
+}
+
+static int
+apply_config_evsel_for_key(const char *name, int map_fd, void *pkey,
+                          struct perf_evsel *evsel)
+{
+       struct xyarray *xy = evsel->fd;
+       struct perf_event_attr *attr;
+       unsigned int key, events;
+       bool check_pass = false;
+       int *evt_fd;
+       int err;
+
+       if (!xy) {
+               pr_debug("ERROR: evsel not ready for map %s\n", name);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+
+       if (xy->row_size / xy->entry_size != 1) {
+               pr_debug("ERROR: Dimension of target event is incorrect for map %s\n",
+                        name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_EVTDIM;
+       }
+
+       attr = &evsel->attr;
+       if (attr->inherit) {
+               pr_debug("ERROR: Can't put inherit event into map %s\n", name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_EVTINH;
+       }
+
+       if (perf_evsel__is_bpf_output(evsel))
+               check_pass = true;
+       if (attr->type == PERF_TYPE_RAW)
+               check_pass = true;
+       if (attr->type == PERF_TYPE_HARDWARE)
+               check_pass = true;
+       if (!check_pass) {
+               pr_debug("ERROR: Event type is wrong for map %s\n", name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_EVTTYPE;
+       }
+
+       events = xy->entries / (xy->row_size / xy->entry_size);
+       key = *((unsigned int *)pkey);
+       if (key >= events) {
+               pr_debug("ERROR: there is no event %d for map %s\n",
+                        key, name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_MAPSIZE;
+       }
+       evt_fd = xyarray__entry(xy, key, 0);
+       err = bpf_map_update_elem(map_fd, pkey, evt_fd, BPF_ANY);
+       if (err && errno)
+               err = -errno;
+       return err;
+}
+
+static int
+apply_obj_config_map_for_key(const char *name, int map_fd,
+                            struct bpf_map_def *pdef __maybe_unused,
+                            struct bpf_map_op *op,
+                            void *pkey, void *arg __maybe_unused)
+{
+       int err;
+
+       switch (op->op_type) {
+       case BPF_MAP_OP_SET_VALUE:
+               err = apply_config_value_for_key(map_fd, pkey,
+                                                pdef->value_size,
+                                                op->v.value);
+               break;
+       case BPF_MAP_OP_SET_EVSEL:
+               err = apply_config_evsel_for_key(name, map_fd, pkey,
+                                                op->v.evsel);
+               break;
+       default:
+               pr_debug("ERROR: unknown value type for '%s'\n", name);
+               err = -BPF_LOADER_ERRNO__INTERNAL;
+       }
+       return err;
+}
+
+static int
+apply_obj_config_map(struct bpf_map *map)
+{
+       return bpf_map_config_foreach_key(map,
+                                         apply_obj_config_map_for_key,
+                                         NULL);
+}
+
+static int
+apply_obj_config_object(struct bpf_object *obj)
+{
+       struct bpf_map *map;
+       int err;
+
+       bpf_map__for_each(map, obj) {
+               err = apply_obj_config_map(map);
+               if (err)
+                       return err;
+       }
+       return 0;
+}
+
+int bpf__apply_obj_config(void)
+{
+       struct bpf_object *obj, *tmp;
+       int err;
+
+       bpf_object__for_each_safe(obj, tmp) {
+               err = apply_obj_config_object(obj);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
  #define ERRNO_OFFSET(e)                ((e) - __BPF_LOADER_ERRNO__START)
  #define ERRCODE_OFFSET(c)      ERRNO_OFFSET(BPF_LOADER_ERRNO__##c)
  #define NR_ERRNO       (__BPF_LOADER_ERRNO__END - __BPF_LOADER_ERRNO__START)
@@ -753,6 +1431,20 @@ static const char *bpf_loader_strerror_table[NR_ERRNO] = {
         [ERRCODE_OFFSET(PROLOGUE)]      = "Failed to generate prologue",
         [ERRCODE_OFFSET(PROLOGUE2BIG)]  = "Prologue too big for program",
         [ERRCODE_OFFSET(PROLOGUEOOB)]   = "Offset out of bound for prologue",
+       [ERRCODE_OFFSET(OBJCONF_OPT)]   = "Invalid object config option",
+       [ERRCODE_OFFSET(OBJCONF_CONF)]  = "Config value not set (missing '=')",
+       [ERRCODE_OFFSET(OBJCONF_MAP_OPT)]       = "Invalid object map config option",
+       [ERRCODE_OFFSET(OBJCONF_MAP_NOTEXIST)]  = "Target map doesn't exist",
+       [ERRCODE_OFFSET(OBJCONF_MAP_VALUE)]     = "Incorrect value type for map",
+       [ERRCODE_OFFSET(OBJCONF_MAP_TYPE)]      = "Incorrect map type",
+       [ERRCODE_OFFSET(OBJCONF_MAP_KEYSIZE)]   = "Incorrect map key size",
+       [ERRCODE_OFFSET(OBJCONF_MAP_VALUESIZE)] = "Incorrect map value size",
+       [ERRCODE_OFFSET(OBJCONF_MAP_NOEVT)]     = "Event not found for map setting",
+       [ERRCODE_OFFSET(OBJCONF_MAP_MAPSIZE)]   = "Invalid map size for event setting",
+       [ERRCODE_OFFSET(OBJCONF_MAP_EVTDIM)]    = "Event dimension too large",
+       [ERRCODE_OFFSET(OBJCONF_MAP_EVTINH)]    = "Doesn't support inherit event",
+       [ERRCODE_OFFSET(OBJCONF_MAP_EVTTYPE)]   = "Wrong event type for map",
+       [ERRCODE_OFFSET(OBJCONF_MAP_IDX2BIG)]   = "Index too large",
  };
  
  static int
@@ -872,3 +1564,29 @@ int bpf__strerror_load(struct bpf_object *obj,
         bpf__strerror_end(buf, size);
         return 0;
  }
+
+int bpf__strerror_config_obj(struct bpf_object *obj __maybe_unused,
+                            struct parse_events_term *term __maybe_unused,
+                            struct perf_evlist *evlist __maybe_unused,
+                            int *error_pos __maybe_unused, int err,
+                            char *buf, size_t size)
+{
+       bpf__strerror_head(err, buf, size);
+       bpf__strerror_entry(BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE,
+                           "Can't use this config term with this map type");
+       bpf__strerror_end(buf, size);
+       return 0;
+}
+
+int bpf__strerror_apply_obj_config(int err, char *buf, size_t size)
+{
+       bpf__strerror_head(err, buf, size);
+       bpf__strerror_entry(BPF_LOADER_ERRNO__OBJCONF_MAP_EVTDIM,
+                           "Cannot set event to BPF map in multi-thread tracing");
+       bpf__strerror_entry(BPF_LOADER_ERRNO__OBJCONF_MAP_EVTINH,
+                           "%s (Hint: use -i to turn off inherit)", emsg);
+       bpf__strerror_entry(BPF_LOADER_ERRNO__OBJCONF_MAP_EVTTYPE,
+                           "Can only put raw, hardware and BPF output event into a BPF map");
+       bpf__strerror_end(buf, size);
+       return 0;
+}
diff --git a/tools/perf/util/bpf-loader.h b/tools/perf/util/bpf-loader.h

index 6fdc0457e2b66ea3fedd8e26c4a0e8f6a4bcc6e2..be4311944e3daa2abc87cdd54be72ed7bce70682 100644 (file)
--- a/tools/perf/util/bpf-loader.h
+++ b/tools/perf/util/bpf-loader.h
@@ -10,6 +10,7 @@
  #include <string.h>
  #include <bpf/libbpf.h>
  #include "probe-event.h"
+#include "evlist.h"
  #include "debug.h"
  
  enum bpf_loader_errno {
@@ -24,10 +25,25 @@ enum bpf_loader_errno {
         BPF_LOADER_ERRNO__PROLOGUE,     /* Failed to generate prologue */
         BPF_LOADER_ERRNO__PROLOGUE2BIG, /* Prologue too big for program */
         BPF_LOADER_ERRNO__PROLOGUEOOB,  /* Offset out of bound for prologue */
+       BPF_LOADER_ERRNO__OBJCONF_OPT,  /* Invalid object config option */
+       BPF_LOADER_ERRNO__OBJCONF_CONF, /* Config value not set (lost '=')) */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_OPT,      /* Invalid object map config option */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_NOTEXIST, /* Target map not exist */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE,    /* Incorrect value type for map */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE,     /* Incorrect map type */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_KEYSIZE,  /* Incorrect map key size */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_VALUESIZE,/* Incorrect map value size */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_NOEVT,    /* Event not found for map setting */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_MAPSIZE,  /* Invalid map size for event setting */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_EVTDIM,   /* Event dimension too large */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_EVTINH,   /* Doesn't support inherit event */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_EVTTYPE,  /* Wrong event type for map */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_IDX2BIG,  /* Index too large */
         __BPF_LOADER_ERRNO__END,
  };
  
  struct bpf_object;
+struct parse_events_term;
  #define PERF_BPF_PROBE_GROUP "perf_bpf_probe"
  
  typedef int (*bpf_prog_iter_callback_t)(struct probe_trace_event *tev,
@@ -53,6 +69,16 @@ int bpf__strerror_load(struct bpf_object *obj, int err,
                        char *buf, size_t size);
  int bpf__foreach_tev(struct bpf_object *obj,
                      bpf_prog_iter_callback_t func, void *arg);
+
+int bpf__config_obj(struct bpf_object *obj, struct parse_events_term *term,
+                   struct perf_evlist *evlist, int *error_pos);
+int bpf__strerror_config_obj(struct bpf_object *obj,
+                            struct parse_events_term *term,
+                            struct perf_evlist *evlist,
+                            int *error_pos, int err, char *buf,
+                            size_t size);
+int bpf__apply_obj_config(void);
+int bpf__strerror_apply_obj_config(int err, char *buf, size_t size);
  #else
  static inline struct bpf_object *
  bpf__prepare_load(const char *filename __maybe_unused,
@@ -83,6 +109,21 @@ bpf__foreach_tev(struct bpf_object *obj __maybe_unused,
         return 0;
  }
  
+static inline int
+bpf__config_obj(struct bpf_object *obj __maybe_unused,
+               struct parse_events_term *term __maybe_unused,
+               struct perf_evlist *evlist __maybe_unused,
+               int *error_pos __maybe_unused)
+{
+       return 0;
+}
+
+static inline int
+bpf__apply_obj_config(void)
+{
+       return 0;
+}
+
  static inline int
  __bpf_strerror(char *buf, size_t size)
  {
@@ -118,5 +159,23 @@ static inline int bpf__strerror_load(struct bpf_object *obj __maybe_unused,
  {
         return __bpf_strerror(buf, size);
  }
+
+static inline int
+bpf__strerror_config_obj(struct bpf_object *obj __maybe_unused,
+                        struct parse_events_term *term __maybe_unused,
+                        struct perf_evlist *evlist __maybe_unused,
+                        int *error_pos __maybe_unused,
+                        int err __maybe_unused,
+                        char *buf, size_t size)
+{
+       return __bpf_strerror(buf, size);
+}
+
+static inline int
+bpf__strerror_apply_obj_config(int err __maybe_unused,
+                              char *buf, size_t size)
+{
+       return __bpf_strerror(buf, size);
+}
  #endif
  #endif
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c

index 6a7e273a514a642b30a477c3119696dc7fa09975..f1479eeef7daf9ae2c386abcef7d079a0370c836 100644 (file)
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -166,6 +166,50 @@ char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size)
         return build_id__filename(build_id_hex, bf, size);
  }
  
+bool dso__build_id_is_kmod(const struct dso *dso, char *bf, size_t size)
+{
+       char *id_name, *ch;
+       struct stat sb;
+
+       id_name = dso__build_id_filename(dso, bf, size);
+       if (!id_name)
+               goto err;
+       if (access(id_name, F_OK))
+               goto err;
+       if (lstat(id_name, &sb) == -1)
+               goto err;
+       if ((size_t)sb.st_size > size - 1)
+               goto err;
+       if (readlink(id_name, bf, size - 1) < 0)
+               goto err;
+
+       bf[sb.st_size] = '\0';
+
+       /*
+        * link should be:
+        * ../../lib/modules/4.4.0-rc4/kernel/net/ipv4/netfilter/nf_nat_ipv4.ko/a09fe3eb3147dafa4e3b31dbd6257e4d696bdc92
+        */
+       ch = strrchr(bf, '/');
+       if (!ch)
+               goto err;
+       if (ch - 3 < bf)
+               goto err;
+
+       return strncmp(".ko", ch - 3, 3) == 0;
+err:
+       /*
+        * If dso__build_id_filename work, get id_name again,
+        * because id_name points to bf and is broken.
+        */
+       if (id_name)
+               id_name = dso__build_id_filename(dso, bf, size);
+       pr_err("Invalid build id: %s\n", id_name ? :
+                                        dso->long_name ? :
+                                        dso->short_name ? :
+                                        "[unknown]");
+       return false;
+}
+
  #define dsos__for_each_with_build_id(pos, head)        \
         list_for_each_entry(pos, head, node)    \
                 if (!pos->has_build_id)         \
@@ -211,6 +255,7 @@ static int machine__write_buildid_table(struct machine *machine, int fd)
         dsos__for_each_with_build_id(pos, &machine->dsos.head) {
                 const char *name;
                 size_t name_len;
+               bool in_kernel = false;
  
                 if (!pos->hit)
                         continue;
@@ -227,8 +272,11 @@ static int machine__write_buildid_table(struct machine *machine, int fd)
                         name_len = pos->long_name_len + 1;
                 }
  
+               in_kernel = pos->kernel ||
+                               is_kernel_module(name,
+                                       PERF_RECORD_MISC_CPUMODE_UNKNOWN);
                 err = write_buildid(name, name_len, pos->build_id, machine->pid,
-                                   pos->kernel ? kmisc : umisc, fd);
+                                   in_kernel ? kmisc : umisc, fd);
                 if (err)
                         break;
         }
diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h

index 27a14a8a945beb8eec9cc389ca7d6545b44a2ac4..64af3e20610d7718ecacfad3810fc4bff4ee882d 100644 (file)
--- a/tools/perf/util/build-id.h
+++ b/tools/perf/util/build-id.h
@@ -16,6 +16,7 @@ int sysfs__sprintf_build_id(const char *root_dir, char *sbuild_id);
  int filename__sprintf_build_id(const char *pathname, char *sbuild_id);
  
  char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size);
+bool dso__build_id_is_kmod(const struct dso *dso, char *bf, size_t size);
  
  int build_id__mark_dso_hit(struct perf_tool *tool, union perf_event *event,
                            struct perf_sample *sample, struct perf_evsel *evsel,
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h

index 07b5d63947b11ec5f70029c6e41266e56b2bd8aa..3ca453f0c51f6f5b269f84270e78132e45f9768c 100644 (file)
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -23,6 +23,8 @@
  #define PERF_TRACEFS_ENVIRONMENT "PERF_TRACEFS_DIR"
  #define PERF_PAGER_ENVIRONMENT "PERF_PAGER"
  
+extern const char *config_exclusive_filename;
+
  typedef int (*config_fn_t)(const char *, const char *, void *);
  extern int perf_default_config(const char *, const char *, void *);
  extern int perf_config(config_fn_t fn, void *);
@@ -31,6 +33,7 @@ extern u64 perf_config_u64(const char *, const char *);
  extern int perf_config_bool(const char *, const char *);
  extern int config_error_nonbool(const char *);
  extern const char *perf_config_dirname(const char *, const char *);
+extern const char *perf_etc_perfconfig(void);
  
  char *alias_lookup(const char *alias);
  int split_cmdline(char *cmdline, const char ***argv);
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c

index 53c43eb9489e4ba4a4e56a795bc05c2b85fb4cbc..24b4bd0d77545e7bb9f95e83222eb103c92f5151 100644 (file)
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -416,7 +416,7 @@ create_child(struct callchain_node *parent, bool inherit_children)
  /*
   * Fill the node with callchain values
   */
-static void
+static int
  fill_node(struct callchain_node *node, struct callchain_cursor *cursor)
  {
         struct callchain_cursor_node *cursor_node;
@@ -433,7 +433,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor)
                 call = zalloc(sizeof(*call));
                 if (!call) {
                         perror("not enough memory for the code path tree");
-                       return;
+                       return -1;
                 }
                 call->ip = cursor_node->ip;
                 call->ms.sym = cursor_node->sym;
@@ -443,6 +443,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor)
                 callchain_cursor_advance(cursor);
                 cursor_node = callchain_cursor_current(cursor);
         }
+       return 0;
  }
  
  static struct callchain_node *
@@ -453,7 +454,19 @@ add_child(struct callchain_node *parent,
         struct callchain_node *new;
  
         new = create_child(parent, false);
-       fill_node(new, cursor);
+       if (new == NULL)
+               return NULL;
+
+       if (fill_node(new, cursor) < 0) {
+               struct callchain_list *call, *tmp;
+
+               list_for_each_entry_safe(call, tmp, &new->val, list) {
+                       list_del(&call->list);
+                       free(call);
+               }
+               free(new);
+               return NULL;
+       }
  
         new->children_hit = 0;
         new->hit = period;
@@ -462,16 +475,32 @@ add_child(struct callchain_node *parent,
         return new;
  }
  
-static s64 match_chain(struct callchain_cursor_node *node,
-                     struct callchain_list *cnode)
+enum match_result {
+       MATCH_ERROR  = -1,
+       MATCH_EQ,
+       MATCH_LT,
+       MATCH_GT,
+};
+
+static enum match_result match_chain(struct callchain_cursor_node *node,
+                                    struct callchain_list *cnode)
  {
         struct symbol *sym = node->sym;
+       u64 left, right;
  
         if (cnode->ms.sym && sym &&
-           callchain_param.key == CCKEY_FUNCTION)
-               return cnode->ms.sym->start - sym->start;
-       else
-               return cnode->ip - node->ip;
+           callchain_param.key == CCKEY_FUNCTION) {
+               left = cnode->ms.sym->start;
+               right = sym->start;
+       } else {
+               left = cnode->ip;
+               right = node->ip;
+       }
+
+       if (left == right)
+               return MATCH_EQ;
+
+       return left > right ? MATCH_GT : MATCH_LT;
  }
  
  /*
@@ -479,7 +508,7 @@ static s64 match_chain(struct callchain_cursor_node *node,
   * give a part of its callchain to the created child.
   * Then create another child to host the given callchain of new branch
   */
-static void
+static int
  split_add_child(struct callchain_node *parent,
                 struct callchain_cursor *cursor,
                 struct callchain_list *to_split,
@@ -491,6 +520,8 @@ split_add_child(struct callchain_node *parent,
  
         /* split */
         new = create_child(parent, true);
+       if (new == NULL)
+               return -1;
  
         /* split the callchain and move a part to the new child */
         old_tail = parent->val.prev;
@@ -524,6 +555,8 @@ split_add_child(struct callchain_node *parent,
  
                 node = callchain_cursor_current(cursor);
                 new = add_child(parent, cursor, period);
+               if (new == NULL)
+                       return -1;
  
                 /*
                  * This is second child since we moved parent's children
@@ -534,7 +567,7 @@ split_add_child(struct callchain_node *parent,
                 cnode = list_first_entry(&first->val, struct callchain_list,
                                          list);
  
-               if (match_chain(node, cnode) < 0)
+               if (match_chain(node, cnode) == MATCH_LT)
                         pp = &p->rb_left;
                 else
                         pp = &p->rb_right;
@@ -545,14 +578,15 @@ split_add_child(struct callchain_node *parent,
                 parent->hit = period;
                 parent->count = 1;
         }
+       return 0;
  }
  
-static int
+static enum match_result
  append_chain(struct callchain_node *root,
              struct callchain_cursor *cursor,
              u64 period);
  
-static void
+static int
  append_chain_children(struct callchain_node *root,
                       struct callchain_cursor *cursor,
                       u64 period)
@@ -564,36 +598,42 @@ append_chain_children(struct callchain_node *root,
  
         node = callchain_cursor_current(cursor);
         if (!node)
-               return;
+               return -1;
  
         /* lookup in childrens */
         while (*p) {
-               s64 ret;
+               enum match_result ret;
  
                 parent = *p;
                 rnode = rb_entry(parent, struct callchain_node, rb_node_in);
  
                 /* If at least first entry matches, rely to children */
                 ret = append_chain(rnode, cursor, period);
-               if (ret == 0)
+               if (ret == MATCH_EQ)
                         goto inc_children_hit;
+               if (ret == MATCH_ERROR)
+                       return -1;
  
-               if (ret < 0)
+               if (ret == MATCH_LT)
                         p = &parent->rb_left;
                 else
                         p = &parent->rb_right;
         }
         /* nothing in children, add to the current node */
         rnode = add_child(root, cursor, period);
+       if (rnode == NULL)
+               return -1;
+
         rb_link_node(&rnode->rb_node_in, parent, p);
         rb_insert_color(&rnode->rb_node_in, &root->rb_root_in);
  
  inc_children_hit:
         root->children_hit += period;
         root->children_count++;
+       return 0;
  }
  
-static int
+static enum match_result
  append_chain(struct callchain_node *root,
              struct callchain_cursor *cursor,
              u64 period)
@@ -602,7 +642,7 @@ append_chain(struct callchain_node *root,
         u64 start = cursor->pos;
         bool found = false;
         u64 matches;
-       int cmp = 0;
+       enum match_result cmp = MATCH_ERROR;
  
         /*
          * Lookup in the current node
@@ -618,7 +658,7 @@ append_chain(struct callchain_node *root,
                         break;
  
                 cmp = match_chain(node, cnode);
-               if (cmp)
+               if (cmp != MATCH_EQ)
                         break;
  
                 found = true;
@@ -628,7 +668,7 @@ append_chain(struct callchain_node *root,
  
         /* matches not, relay no the parent */
         if (!found) {
-               WARN_ONCE(!cmp, "Chain comparison error\n");
+               WARN_ONCE(cmp == MATCH_ERROR, "Chain comparison error\n");
                 return cmp;
         }
  
@@ -636,21 +676,25 @@ append_chain(struct callchain_node *root,
  
         /* we match only a part of the node. Split it and add the new chain */
         if (matches < root->val_nr) {
-               split_add_child(root, cursor, cnode, start, matches, period);
-               return 0;
+               if (split_add_child(root, cursor, cnode, start, matches,
+                                   period) < 0)
+                       return MATCH_ERROR;
+
+               return MATCH_EQ;
         }
  
         /* we match 100% of the path, increment the hit */
         if (matches == root->val_nr && cursor->pos == cursor->nr) {
                 root->hit += period;
                 root->count++;
-               return 0;
+               return MATCH_EQ;
         }
  
         /* We match the node and still have a part remaining */
-       append_chain_children(root, cursor, period);
+       if (append_chain_children(root, cursor, period) < 0)
+               return MATCH_ERROR;
  
-       return 0;
+       return MATCH_EQ;
  }
  
  int callchain_append(struct callchain_root *root,
@@ -662,7 +706,8 @@ int callchain_append(struct callchain_root *root,
  
         callchain_cursor_commit(cursor);
  
-       append_chain_children(&root->node, cursor, period);
+       if (append_chain_children(&root->node, cursor, period) < 0)
+               return -1;
  
         if (cursor->nr > root->max_depth)
                 root->max_depth = cursor->nr;
@@ -690,7 +735,8 @@ merge_chain_branch(struct callchain_cursor *cursor,
  
         if (src->hit) {
                 callchain_cursor_commit(cursor);
-               append_chain_children(dst, cursor, src->hit);
+               if (append_chain_children(dst, cursor, src->hit) < 0)
+                       return -1;
         }
  
         n = rb_first(&src->rb_root_in);
diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c

index e5fb88bab9e1c416a3c53fe5036f8eb57f64a55d..43e84aa27e4a6489eafdebc3636d8bfb33334e3f 100644 (file)
--- a/tools/perf/util/color.c
+++ b/tools/perf/util/color.c
@@ -32,14 +32,15 @@ int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty)
         return 0;
  }
  
-int perf_color_default_config(const char *var, const char *value, void *cb)
+int perf_color_default_config(const char *var, const char *value,
+                             void *cb __maybe_unused)
  {
         if (!strcmp(var, "color.ui")) {
                 perf_use_color_default = perf_config_colorbool(var, value, -1);
                 return 0;
         }
  
-       return perf_default_config(var, value, cb);
+       return 0;
  }
  
  static int __color_vsnprintf(char *bf, size_t size, const char *color,
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c

index d3e12e30e1d520f8073d1f01d17e1eaf4ac30254..4e727635476eadf5b105a7be4620f86a4bf46499 100644 (file)
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -26,7 +26,7 @@ static const char *config_file_name;
  static int config_linenr;
  static int config_file_eof;
  
-static const char *config_exclusive_filename;
+const char *config_exclusive_filename;
  
  static int get_next_char(void)
  {
@@ -434,7 +434,7 @@ static int perf_config_from_file(config_fn_t fn, const char *filename, void *dat
         return ret;
  }
  
-static const char *perf_etc_perfconfig(void)
+const char *perf_etc_perfconfig(void)
  {
         static const char *system_wide;
         if (!system_wide)
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c

index fa935093a599429011fa214c24645fd0230e3b3a..9bcf2bed3a6d1b7369ee4deee7f38e9c4abab06d 100644 (file)
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -8,6 +8,10 @@
  #include <linux/bitmap.h>
  #include "asm/bug.h"
  
+static int max_cpu_num;
+static int max_node_num;
+static int *cpunode_map;
+
  static struct cpu_map *cpu_map__default_new(void)
  {
         struct cpu_map *cpus;
@@ -486,6 +490,32 @@ out:
                 pr_err("Failed to read max nodes, using default of %d\n", max_node_num);
  }
  
+int cpu__max_node(void)
+{
+       if (unlikely(!max_node_num))
+               set_max_node_num();
+
+       return max_node_num;
+}
+
+int cpu__max_cpu(void)
+{
+       if (unlikely(!max_cpu_num))
+               set_max_cpu_num();
+
+       return max_cpu_num;
+}
+
+int cpu__get_node(int cpu)
+{
+       if (unlikely(cpunode_map == NULL)) {
+               pr_debug("cpu_map not initialized\n");
+               return -1;
+       }
+
+       return cpunode_map[cpu];
+}
+
  static int init_cpunode_map(void)
  {
         int i;
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h

index 71c41b9efabb3b38dd17a7374413985dfd944f77..81a2562aaa2b02261b88c960997238dc9e0925ab 100644 (file)
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -57,37 +57,11 @@ static inline bool cpu_map__empty(const struct cpu_map *map)
         return map ? map->map[0] == -1 : true;
  }
  
-int max_cpu_num;
-int max_node_num;
-int *cpunode_map;
-
  int cpu__setup_cpunode_map(void);
  
-static inline int cpu__max_node(void)
-{
-       if (unlikely(!max_node_num))
-               pr_debug("cpu_map not initialized\n");
-
-       return max_node_num;
-}
-
-static inline int cpu__max_cpu(void)
-{
-       if (unlikely(!max_cpu_num))
-               pr_debug("cpu_map not initialized\n");
-
-       return max_cpu_num;
-}
-
-static inline int cpu__get_node(int cpu)
-{
-       if (unlikely(cpunode_map == NULL)) {
-               pr_debug("cpu_map not initialized\n");
-               return -1;
-       }
-
-       return cpunode_map[cpu];
-}
+int cpu__max_node(void);
+int cpu__max_cpu(void);
+int cpu__get_node(int cpu);
  
  int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
                        int (*f)(struct cpu_map *map, int cpu, void *data),
diff --git a/tools/perf/util/ctype.c b/tools/perf/util/ctype.c

index aada3ac5e891f85be1f4d25791d4ee17dd859077..d4a5a21c2a7e2e47596444665b5f5828874da337 100644 (file)
--- a/tools/perf/util/ctype.c
+++ b/tools/perf/util/ctype.c
@@ -31,9 +31,18 @@ unsigned char sane_ctype[256] = {
  };
  
  const char *graph_line =
+       "_____________________________________________________________________"
         "_____________________________________________________________________"
         "_____________________________________________________________________";
  const char *graph_dotted_line =
         "---------------------------------------------------------------------"
         "---------------------------------------------------------------------"
         "---------------------------------------------------------------------";
+const char *spaces =
+       "                                                                     "
+       "                                                                     "
+       "                                                                     ";
+const char *dots =
+       "....................................................................."
+       "....................................................................."
+       ".....................................................................";
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c

index 34cd1e4039d35e05be0460ff0a259e7d8b2bb80a..811af89ce0bb8f37b99898dc8b2a2d5f6e183d26 100644 (file)
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -352,6 +352,84 @@ static int add_tracepoint_values(struct ctf_writer *cw,
         return ret;
  }
  
+static int
+add_bpf_output_values(struct bt_ctf_event_class *event_class,
+                     struct bt_ctf_event *event,
+                     struct perf_sample *sample)
+{
+       struct bt_ctf_field_type *len_type, *seq_type;
+       struct bt_ctf_field *len_field, *seq_field;
+       unsigned int raw_size = sample->raw_size;
+       unsigned int nr_elements = raw_size / sizeof(u32);
+       unsigned int i;
+       int ret;
+
+       if (nr_elements * sizeof(u32) != raw_size)
+               pr_warning("Incorrect raw_size (%u) in bpf output event, skip %lu bytes\n",
+                          raw_size, nr_elements * sizeof(u32) - raw_size);
+
+       len_type = bt_ctf_event_class_get_field_by_name(event_class, "raw_len");
+       len_field = bt_ctf_field_create(len_type);
+       if (!len_field) {
+               pr_err("failed to create 'raw_len' for bpf output event\n");
+               ret = -1;
+               goto put_len_type;
+       }
+
+       ret = bt_ctf_field_unsigned_integer_set_value(len_field, nr_elements);
+       if (ret) {
+               pr_err("failed to set field value for raw_len\n");
+               goto put_len_field;
+       }
+       ret = bt_ctf_event_set_payload(event, "raw_len", len_field);
+       if (ret) {
+               pr_err("failed to set payload to raw_len\n");
+               goto put_len_field;
+       }
+
+       seq_type = bt_ctf_event_class_get_field_by_name(event_class, "raw_data");
+       seq_field = bt_ctf_field_create(seq_type);
+       if (!seq_field) {
+               pr_err("failed to create 'raw_data' for bpf output event\n");
+               ret = -1;
+               goto put_seq_type;
+       }
+
+       ret = bt_ctf_field_sequence_set_length(seq_field, len_field);
+       if (ret) {
+               pr_err("failed to set length of 'raw_data'\n");
+               goto put_seq_field;
+       }
+
+       for (i = 0; i < nr_elements; i++) {
+               struct bt_ctf_field *elem_field =
+                       bt_ctf_field_sequence_get_field(seq_field, i);
+
+               ret = bt_ctf_field_unsigned_integer_set_value(elem_field,
+                               ((u32 *)(sample->raw_data))[i]);
+
+               bt_ctf_field_put(elem_field);
+               if (ret) {
+                       pr_err("failed to set raw_data[%d]\n", i);
+                       goto put_seq_field;
+               }
+       }
+
+       ret = bt_ctf_event_set_payload(event, "raw_data", seq_field);
+       if (ret)
+               pr_err("failed to set payload for raw_data\n");
+
+put_seq_field:
+       bt_ctf_field_put(seq_field);
+put_seq_type:
+       bt_ctf_field_type_put(seq_type);
+put_len_field:
+       bt_ctf_field_put(len_field);
+put_len_type:
+       bt_ctf_field_type_put(len_type);
+       return ret;
+}
+
  static int add_generic_values(struct ctf_writer *cw,
                               struct bt_ctf_event *event,
                               struct perf_evsel *evsel,
@@ -597,6 +675,12 @@ static int process_sample_event(struct perf_tool *tool,
                         return -1;
         }
  
+       if (perf_evsel__is_bpf_output(evsel)) {
+               ret = add_bpf_output_values(event_class, event, sample);
+               if (ret)
+                       return -1;
+       }
+
         cs = ctf_stream(cw, get_sample_cpu(cw, sample, evsel));
         if (cs) {
                 if (is_flush_needed(cs))
@@ -744,6 +828,25 @@ static int add_tracepoint_types(struct ctf_writer *cw,
         return ret;
  }
  
+static int add_bpf_output_types(struct ctf_writer *cw,
+                               struct bt_ctf_event_class *class)
+{
+       struct bt_ctf_field_type *len_type = cw->data.u32;
+       struct bt_ctf_field_type *seq_base_type = cw->data.u32_hex;
+       struct bt_ctf_field_type *seq_type;
+       int ret;
+
+       ret = bt_ctf_event_class_add_field(class, len_type, "raw_len");
+       if (ret)
+               return ret;
+
+       seq_type = bt_ctf_field_type_sequence_create(seq_base_type, "raw_len");
+       if (!seq_type)
+               return -1;
+
+       return bt_ctf_event_class_add_field(class, seq_type, "raw_data");
+}
+
  static int add_generic_types(struct ctf_writer *cw, struct perf_evsel *evsel,
                              struct bt_ctf_event_class *event_class)
  {
@@ -755,7 +858,8 @@ static int add_generic_types(struct ctf_writer *cw, struct perf_evsel *evsel,
          *                              ctf event header
          *   PERF_SAMPLE_READ         - TODO
          *   PERF_SAMPLE_CALLCHAIN    - TODO
-        *   PERF_SAMPLE_RAW          - tracepoint fields are handled separately
+        *   PERF_SAMPLE_RAW          - tracepoint fields and BPF output
+        *                              are handled separately
          *   PERF_SAMPLE_BRANCH_STACK - TODO
          *   PERF_SAMPLE_REGS_USER    - TODO
          *   PERF_SAMPLE_STACK_USER   - TODO
@@ -824,6 +928,12 @@ static int add_event(struct ctf_writer *cw, struct perf_evsel *evsel)
                         goto err;
         }
  
+       if (perf_evsel__is_bpf_output(evsel)) {
+               ret = add_bpf_output_types(cw, event_class);
+               if (ret)
+                       goto err;
+       }
+
         ret = bt_ctf_stream_class_add_event_class(cw->stream_class, event_class);
         if (ret) {
                 pr("Failed to add event class into stream.\n");
@@ -858,6 +968,23 @@ static int setup_events(struct ctf_writer *cw, struct perf_session *session)
         return 0;
  }
  
+static void cleanup_events(struct perf_session *session)
+{
+       struct perf_evlist *evlist = session->evlist;
+       struct perf_evsel *evsel;
+
+       evlist__for_each(evlist, evsel) {
+               struct evsel_priv *priv;
+
+               priv = evsel->priv;
+               bt_ctf_event_class_put(priv->event_class);
+               zfree(&evsel->priv);
+       }
+
+       perf_evlist__delete(evlist);
+       session->evlist = NULL;
+}
+
  static int setup_streams(struct ctf_writer *cw, struct perf_session *session)
  {
         struct ctf_stream **stream;
@@ -953,6 +1080,12 @@ static struct bt_ctf_field_type *create_int_type(int size, bool sign, bool hex)
             bt_ctf_field_type_integer_set_base(type, BT_CTF_INTEGER_BASE_HEXADECIMAL))
                 goto err;
  
+#if __BYTE_ORDER == __BIG_ENDIAN
+       bt_ctf_field_type_set_byte_order(type, BT_CTF_BYTE_ORDER_BIG_ENDIAN);
+#else
+       bt_ctf_field_type_set_byte_order(type, BT_CTF_BYTE_ORDER_LITTLE_ENDIAN);
+#endif
+
         pr2("Created type: INTEGER %d-bit %ssigned %s\n",
             size, sign ? "un" : "", hex ? "hex" : "");
         return type;
@@ -1100,7 +1233,7 @@ static int convert__config(const char *var, const char *value, void *cb)
                 return 0;
         }
  
-       return perf_default_config(var, value, cb);
+       return 0;
  }
  
  int bt_convert__perf2ctf(const char *input, const char *path, bool force)
@@ -1171,6 +1304,7 @@ int bt_convert__perf2ctf(const char *input, const char *path, bool force)
                 (double) c.events_size / 1024.0 / 1024.0,
                 c.events_count);
  
+       cleanup_events(session);
         perf_session__delete(session);
         ctf_writer__cleanup(cw);
  
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c

index 86d9c73025983d0132a4dddc8607161d713bfbf2..8c4212abd19b48b9e84ca1f9978f06561c05576a 100644 (file)
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -5,6 +5,7 @@
  #include <string.h>
  #include <stdarg.h>
  #include <stdio.h>
+#include <api/debug.h>
  
  #include "cache.h"
  #include "color.h"
@@ -22,7 +23,7 @@ int debug_ordered_events;
  static int redirect_to_stderr;
  int debug_data_convert;
  
-static int _eprintf(int level, int var, const char *fmt, va_list args)
+int veprintf(int level, int var, const char *fmt, va_list args)
  {
         int ret = 0;
  
@@ -36,24 +37,19 @@ static int _eprintf(int level, int var, const char *fmt, va_list args)
         return ret;
  }
  
-int veprintf(int level, int var, const char *fmt, va_list args)
-{
-       return _eprintf(level, var, fmt, args);
-}
-
  int eprintf(int level, int var, const char *fmt, ...)
  {
         va_list args;
         int ret;
  
         va_start(args, fmt);
-       ret = _eprintf(level, var, fmt, args);
+       ret = veprintf(level, var, fmt, args);
         va_end(args);
  
         return ret;
  }
  
-static int __eprintf_time(u64 t, const char *fmt, va_list args)
+static int veprintf_time(u64 t, const char *fmt, va_list args)
  {
         int ret = 0;
         u64 secs, usecs, nsecs = t;
@@ -75,7 +71,7 @@ int eprintf_time(int level, int var, u64 t, const char *fmt, ...)
  
         if (var >= level) {
                 va_start(args, fmt);
-               ret = __eprintf_time(t, fmt, args);
+               ret = veprintf_time(t, fmt, args);
                 va_end(args);
         }
  
@@ -91,7 +87,7 @@ void pr_stat(const char *fmt, ...)
         va_list args;
  
         va_start(args, fmt);
-       _eprintf(1, verbose, fmt, args);
+       veprintf(1, verbose, fmt, args);
         va_end(args);
         eprintf(1, verbose, "\n");
  }
@@ -110,40 +106,61 @@ int dump_printf(const char *fmt, ...)
         return ret;
  }
  
+static void trace_event_printer(enum binary_printer_ops op,
+                               unsigned int val, void *extra)
+{
+       const char *color = PERF_COLOR_BLUE;
+       union perf_event *event = (union perf_event *)extra;
+       unsigned char ch = (unsigned char)val;
+
+       switch (op) {
+       case BINARY_PRINT_DATA_BEGIN:
+               printf(".");
+               color_fprintf(stdout, color, "\n. ... raw event: size %d bytes\n",
+                               event->header.size);
+               break;
+       case BINARY_PRINT_LINE_BEGIN:
+               printf(".");
+               break;
+       case BINARY_PRINT_ADDR:
+               color_fprintf(stdout, color, "  %04x: ", val);
+               break;
+       case BINARY_PRINT_NUM_DATA:
+               color_fprintf(stdout, color, " %02x", val);
+               break;
+       case BINARY_PRINT_NUM_PAD:
+               color_fprintf(stdout, color, "   ");
+               break;
+       case BINARY_PRINT_SEP:
+               color_fprintf(stdout, color, "  ");
+               break;
+       case BINARY_PRINT_CHAR_DATA:
+               color_fprintf(stdout, color, "%c",
+                             isprint(ch) ? ch : '.');
+               break;
+       case BINARY_PRINT_CHAR_PAD:
+               color_fprintf(stdout, color, " ");
+               break;
+       case BINARY_PRINT_LINE_END:
+               color_fprintf(stdout, color, "\n");
+               break;
+       case BINARY_PRINT_DATA_END:
+               printf("\n");
+               break;
+       default:
+               break;
+       }
+}
+
  void trace_event(union perf_event *event)
  {
         unsigned char *raw_event = (void *)event;
-       const char *color = PERF_COLOR_BLUE;
-       int i, j;
  
         if (!dump_trace)
                 return;
  
-       printf(".");
-       color_fprintf(stdout, color, "\n. ... raw event: size %d bytes\n",
-                     event->header.size);
-
-       for (i = 0; i < event->header.size; i++) {
-               if ((i & 15) == 0) {
-                       printf(".");
-                       color_fprintf(stdout, color, "  %04x: ", i);
-               }
-
-               color_fprintf(stdout, color, " %02x", raw_event[i]);
-
-               if (((i & 15) == 15) || i == event->header.size-1) {
-                       color_fprintf(stdout, color, "  ");
-                       for (j = 0; j < 15-(i & 15); j++)
-                               color_fprintf(stdout, color, "   ");
-                       for (j = i & ~15; j <= i; j++) {
-                               color_fprintf(stdout, color, "%c",
-                                             isprint(raw_event[j]) ?
-                                             raw_event[j] : '.');
-                       }
-                       color_fprintf(stdout, color, "\n");
-               }
-       }
-       printf(".\n");
+       print_binary(raw_event, event->header.size, 16,
+                    trace_event_printer, event);
  }
  
  static struct debug_variable {
@@ -192,3 +209,23 @@ int perf_debug_option(const char *str)
         free(s);
         return 0;
  }
+
+#define DEBUG_WRAPPER(__n, __l)                                \
+static int pr_ ## __n ## _wrapper(const char *fmt, ...)        \
+{                                                      \
+       va_list args;                                   \
+       int ret;                                        \
+                                                       \
+       va_start(args, fmt);                            \
+       ret = veprintf(__l, verbose, fmt, args);        \
+       va_end(args);                                   \
+       return ret;                                     \
+}
+
+DEBUG_WRAPPER(warning, 0);
+DEBUG_WRAPPER(debug, 1);
+
+void perf_debug_setup(void)
+{
+       libapi_set_print(pr_warning_wrapper, pr_warning_wrapper, pr_debug_wrapper);
+}
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h

index 8b9a088c32ab4e330ece47ae91397bdd87bdd0ee..14bafda79edaeba1bbd3e007b5faa8607d0570bf 100644 (file)
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -53,5 +53,6 @@ int eprintf_time(int level, int var, u64 t, const char *fmt, ...) __attribute__(
  int veprintf(int level, int var, const char *fmt, va_list args);
  
  int perf_debug_option(const char *str);
+void perf_debug_setup(void);
  
  #endif /* __PERF_DEBUG_H */
diff --git a/tools/perf/util/demangle-java.c b/tools/perf/util/demangle-java.c

new file mode 100644 (file)

index 0000000..3e6062a
--- /dev/null
+++ b/tools/perf/util/demangle-java.c
@@ -0,0 +1,199 @@
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include "util.h"
+#include "debug.h"
+#include "symbol.h"
+
+#include "demangle-java.h"
+
+enum {
+       MODE_PREFIX = 0,
+       MODE_CLASS  = 1,
+       MODE_FUNC   = 2,
+       MODE_TYPE   = 3,
+       MODE_CTYPE  = 3, /* class arg */
+};
+
+#define BASE_ENT(c, n) [c - 'A']=n
+static const char *base_types['Z' - 'A' + 1] = {
+       BASE_ENT('B', "byte" ),
+       BASE_ENT('C', "char" ),
+       BASE_ENT('D', "double" ),
+       BASE_ENT('F', "float" ),
+       BASE_ENT('I', "int" ),
+       BASE_ENT('J', "long" ),
+       BASE_ENT('S', "short" ),
+       BASE_ENT('Z', "bool" ),
+};
+
+/*
+ * demangle Java symbol between str and end positions and stores
+ * up to maxlen characters into buf. The parser starts in mode.
+ *
+ * Use MODE_PREFIX to process entire prototype till end position
+ * Use MODE_TYPE to process return type if str starts on return type char
+ *
+ *  Return:
+ *     success: buf
+ *     error  : NULL
+ */
+static char *
+__demangle_java_sym(const char *str, const char *end, char *buf, int maxlen, int mode)
+{
+       int rlen = 0;
+       int array = 0;
+       int narg = 0;
+       const char *q;
+
+       if (!end)
+               end = str + strlen(str);
+
+       for (q = str; q != end; q++) {
+
+               if (rlen == (maxlen - 1))
+                       break;
+
+               switch (*q) {
+               case 'L':
+                       if (mode == MODE_PREFIX || mode == MODE_CTYPE) {
+                               if (mode == MODE_CTYPE) {
+                                       if (narg)
+                                               rlen += scnprintf(buf + rlen, maxlen - rlen, ", ");
+                                       narg++;
+                               }
+                               rlen += scnprintf(buf + rlen, maxlen - rlen, "class ");
+                               if (mode == MODE_PREFIX)
+                                       mode = MODE_CLASS;
+                       } else
+                               buf[rlen++] = *q;
+                       break;
+               case 'B':
+               case 'C':
+               case 'D':
+               case 'F':
+               case 'I':
+               case 'J':
+               case 'S':
+               case 'Z':
+                       if (mode == MODE_TYPE) {
+                               if (narg)
+                                       rlen += scnprintf(buf + rlen, maxlen - rlen, ", ");
+                               rlen += scnprintf(buf + rlen, maxlen - rlen, "%s", base_types[*q - 'A']);
+                               while (array--)
+                                       rlen += scnprintf(buf + rlen, maxlen - rlen, "[]");
+                               array = 0;
+                               narg++;
+                       } else
+                               buf[rlen++] = *q;
+                       break;
+               case 'V':
+                       if (mode == MODE_TYPE) {
+                               rlen += scnprintf(buf + rlen, maxlen - rlen, "void");
+                               while (array--)
+                                       rlen += scnprintf(buf + rlen, maxlen - rlen, "[]");
+                               array = 0;
+                       } else
+                               buf[rlen++] = *q;
+                       break;
+               case '[':
+                       if (mode != MODE_TYPE)
+                               goto error;
+                       array++;
+                       break;
+               case '(':
+                       if (mode != MODE_FUNC)
+                               goto error;
+                       buf[rlen++] = *q;
+                       mode = MODE_TYPE;
+                       break;
+               case ')':
+                       if (mode != MODE_TYPE)
+                               goto error;
+                       buf[rlen++] = *q;
+                       narg = 0;
+                       break;
+               case ';':
+                       if (mode != MODE_CLASS && mode != MODE_CTYPE)
+                               goto error;
+                       /* safe because at least one other char to process */
+                       if (isalpha(*(q + 1)))
+                               rlen += scnprintf(buf + rlen, maxlen - rlen, ".");
+                       if (mode == MODE_CLASS)
+                               mode = MODE_FUNC;
+                       else if (mode == MODE_CTYPE)
+                               mode = MODE_TYPE;
+                       break;
+               case '/':
+                       if (mode != MODE_CLASS && mode != MODE_CTYPE)
+                               goto error;
+                       rlen += scnprintf(buf + rlen, maxlen - rlen, ".");
+                       break;
+               default :
+                       buf[rlen++] = *q;
+               }
+       }
+       buf[rlen] = '\0';
+       return buf;
+error:
+       return NULL;
+}
+
+/*
+ * Demangle Java function signature (openJDK, not GCJ)
+ * input:
+ *     str: string to parse. String is not modified
+ *    flags: comobination of JAVA_DEMANGLE_* flags to modify demangling
+ * return:
+ *     if input can be demangled, then a newly allocated string is returned.
+ *     if input cannot be demangled, then NULL is returned
+ *
+ * Note: caller is responsible for freeing demangled string
+ */
+char *
+java_demangle_sym(const char *str, int flags)
+{
+       char *buf, *ptr;
+       char *p;
+       size_t len, l1 = 0;
+
+       if (!str)
+               return NULL;
+
+       /* find start of retunr type */
+       p = strrchr(str, ')');
+       if (!p)
+               return NULL;
+
+       /*
+        * expansion factor estimated to 3x
+        */
+       len = strlen(str) * 3 + 1;
+       buf = malloc(len);
+       if (!buf)
+               return NULL;
+
+       buf[0] = '\0';
+       if (!(flags & JAVA_DEMANGLE_NORET)) {
+               /*
+                * get return type first
+                */
+               ptr = __demangle_java_sym(p + 1, NULL, buf, len, MODE_TYPE);
+               if (!ptr)
+                       goto error;
+
+               /* add space between return type and function prototype */
+               l1 = strlen(buf);
+               buf[l1++] = ' ';
+       }
+
+       /* process function up to return type */
+       ptr = __demangle_java_sym(str, p + 1, buf + l1, len - l1, MODE_PREFIX);
+       if (!ptr)
+               goto error;
+
+       return buf;
+error:
+       free(buf);
+       return NULL;
+}
diff --git a/tools/perf/util/demangle-java.h b/tools/perf/util/demangle-java.h

new file mode 100644 (file)

index 0000000..a981c1f
--- /dev/null
+++ b/tools/perf/util/demangle-java.h
@@ -0,0 +1,10 @@
+#ifndef __PERF_DEMANGLE_JAVA
+#define __PERF_DEMANGLE_JAVA 1
+/*
+ * demangle function flags
+ */
+#define JAVA_DEMANGLE_NORET    0x1 /* do not process return type */
+
+char * java_demangle_sym(const char *str, int flags);
+
+#endif /* __PERF_DEMANGLE_JAVA */
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c

index e8e9a9dbf5e395a20d589c80b253c2d869b927c1..8e6395439ca0830cefaaa5b6dbe905ae2af93011 100644 (file)
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -52,6 +52,11 @@ int dso__read_binary_type_filename(const struct dso *dso,
                         debuglink--;
                 if (*debuglink == '/')
                         debuglink++;
+
+               ret = -1;
+               if (!is_regular_file(filename))
+                       break;
+
                 ret = filename__read_debuglink(filename, debuglink,
                                                size - (debuglink - filename));
                 }
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c

index 7dd5939dea2e58385bd374a3e59aecf766e799c1..49a11d9d8b8f050efa48715cfba9e54731f63eef 100644 (file)
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -6,6 +6,8 @@ struct perf_env perf_env;
  
  void perf_env__exit(struct perf_env *env)
  {
+       int i;
+
         zfree(&env->hostname);
         zfree(&env->os_release);
         zfree(&env->version);
@@ -19,6 +21,10 @@ void perf_env__exit(struct perf_env *env)
         zfree(&env->numa_nodes);
         zfree(&env->pmu_mappings);
         zfree(&env->cpu);
+
+       for (i = 0; i < env->caches_cnt; i++)
+               cpu_cache_level__free(&env->caches[i]);
+       zfree(&env->caches);
  }
  
  int perf_env__set_cmdline(struct perf_env *env, int argc, const char *argv[])
@@ -75,3 +81,10 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
         env->nr_cpus_avail = nr_cpus;
         return 0;
  }
+
+void cpu_cache_level__free(struct cpu_cache_level *cache)
+{
+       free(cache->type);
+       free(cache->map);
+       free(cache->size);
+}
diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h

index 0132b9557c02b56f7f31e3e51981d55b0d1027bd..56cffb60a0b42e456a11fb2bd489d74a69893b0c 100644 (file)
--- a/tools/perf/util/env.h
+++ b/tools/perf/util/env.h
@@ -1,11 +1,23 @@
  #ifndef __PERF_ENV_H
  #define __PERF_ENV_H
  
+#include <linux/types.h>
+
  struct cpu_topology_map {
         int     socket_id;
         int     core_id;
  };
  
+struct cpu_cache_level {
+       u32     level;
+       u32     line_size;
+       u32     sets;
+       u32     ways;
+       char    *type;
+       char    *size;
+       char    *map;
+};
+
  struct perf_env {
         char                    *hostname;
         char                    *os_release;
@@ -31,6 +43,8 @@ struct perf_env {
         char                    *numa_nodes;
         char                    *pmu_mappings;
         struct cpu_topology_map *cpu;
+       struct cpu_cache_level  *caches;
+       int                      caches_cnt;
  };
  
  extern struct perf_env perf_env;
@@ -41,4 +55,5 @@ int perf_env__set_cmdline(struct perf_env *env, int argc, const char *argv[]);
  
  int perf_env__read_cpu_topology_map(struct perf_env *env);
  
+void cpu_cache_level__free(struct cpu_cache_level *cache);
  #endif /* __PERF_ENV_H */
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c

index 85155e91b61ba9b70feaf867b109270df498b9b8..7bad5c3fa7b7175862f5e3bdf38ffca0e1b14ee1 100644 (file)
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -282,7 +282,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
                 strcpy(execname, "");
  
                 /* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
-               n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %x:%x %u %s\n",
+               n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %x:%x %u %[^\n]\n",
                        &event->mmap2.start, &event->mmap2.len, prot,
                        &event->mmap2.pgoff, &event->mmap2.maj,
                        &event->mmap2.min,
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c

index d81f13de24769963f6c32c3f809a714e6868dc5e..86a03836a83fc3f8ee8648d83317b8d91e3f48d8 100644 (file)
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1181,12 +1181,12 @@ void perf_evlist__set_maps(struct perf_evlist *evlist, struct cpu_map *cpus,
          */
         if (cpus != evlist->cpus) {
                 cpu_map__put(evlist->cpus);
-               evlist->cpus = cpus;
+               evlist->cpus = cpu_map__get(cpus);
         }
  
         if (threads != evlist->threads) {
                 thread_map__put(evlist->threads);
-               evlist->threads = threads;
+               evlist->threads = thread_map__get(threads);
         }
  
         perf_evlist__propagate_maps(evlist);
@@ -1223,6 +1223,9 @@ int perf_evlist__set_filter(struct perf_evlist *evlist, const char *filter)
         int err = 0;
  
         evlist__for_each(evlist, evsel) {
+               if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
+                       continue;
+
                 err = perf_evsel__set_filter(evsel, filter);
                 if (err)
                         break;
@@ -1624,7 +1627,7 @@ size_t perf_evlist__fprintf(struct perf_evlist *evlist, FILE *fp)
         return printed + fprintf(fp, "\n");
  }
  
-int perf_evlist__strerror_open(struct perf_evlist *evlist __maybe_unused,
+int perf_evlist__strerror_open(struct perf_evlist *evlist,
                                int err, char *buf, size_t size)
  {
         int printed, value;
@@ -1652,7 +1655,25 @@ int perf_evlist__strerror_open(struct perf_evlist *evlist __maybe_unused,
                                     "Hint:\tTry: 'sudo sh -c \"echo -1 > /proc/sys/kernel/perf_event_paranoid\"'\n"
                                     "Hint:\tThe current value is %d.", value);
                 break;
+       case EINVAL: {
+               struct perf_evsel *first = perf_evlist__first(evlist);
+               int max_freq;
+
+               if (sysctl__read_int("kernel/perf_event_max_sample_rate", &max_freq) < 0)
+                       goto out_default;
+
+               if (first->attr.sample_freq < (u64)max_freq)
+                       goto out_default;
+
+               printed = scnprintf(buf, size,
+                                   "Error:\t%s.\n"
+                                   "Hint:\tCheck /proc/sys/kernel/perf_event_max_sample_rate.\n"
+                                   "Hint:\tThe current value is %d and %" PRIu64 " is being requested.",
+                                   emsg, max_freq, first->attr.sample_freq);
+               break;
+       }
         default:
+out_default:
                 scnprintf(buf, size, "%s", emsg);
                 break;
         }
@@ -1723,3 +1744,19 @@ void perf_evlist__set_tracking_event(struct perf_evlist *evlist,
  
         tracking_evsel->tracking = true;
  }
+
+struct perf_evsel *
+perf_evlist__find_evsel_by_str(struct perf_evlist *evlist,
+                              const char *str)
+{
+       struct perf_evsel *evsel;
+
+       evlist__for_each(evlist, evsel) {
+               if (!evsel->name)
+                       continue;
+               if (strcmp(str, evsel->name) == 0)
+                       return evsel;
+       }
+
+       return NULL;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h

index 7c4d9a2067769b0e3de2d96a11e6f7dd7d62106a..a0d15221db6e878412126f1ec5de08cb030f132f 100644 (file)
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -294,4 +294,7 @@ void perf_evlist__set_tracking_event(struct perf_evlist *evlist,
                                      struct perf_evsel *tracking_evsel);
  
  void perf_event_attr__set_max_precise_ip(struct perf_event_attr *attr);
+
+struct perf_evsel *
+perf_evlist__find_evsel_by_str(struct perf_evlist *evlist, const char *str);
  #endif /* __PERF_EVLIST_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c

index cdbaf9b51e428ad4537a38ded24ee07bec54e90c..0902fe418754ec0c3149d3daeafc122fee65e243 100644 (file)
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -225,6 +225,11 @@ struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx)
         if (evsel != NULL)
                 perf_evsel__init(evsel, attr, idx);
  
+       if (perf_evsel__is_bpf_output(evsel)) {
+               evsel->attr.sample_type |= PERF_SAMPLE_RAW;
+               evsel->attr.sample_period = 1;
+       }
+
         return evsel;
  }
  
@@ -898,6 +903,16 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts)
         if (evsel->precise_max)
                 perf_event_attr__set_max_precise_ip(attr);
  
+       if (opts->all_user) {
+               attr->exclude_kernel = 1;
+               attr->exclude_user   = 0;
+       }
+
+       if (opts->all_kernel) {
+               attr->exclude_kernel = 0;
+               attr->exclude_user   = 1;
+       }
+
         /*
          * Apply event specific term settings,
          * it overloads any global configuration.
@@ -2362,12 +2377,15 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
         case EPERM:
         case EACCES:
                 return scnprintf(msg, size,
-                "You may not have permission to collect %sstats.\n"
-                "Consider tweaking /proc/sys/kernel/perf_event_paranoid:\n"
-                " -1 - Not paranoid at all\n"
-                "  0 - Disallow raw tracepoint access for unpriv\n"
-                "  1 - Disallow cpu events for unpriv\n"
-                "  2 - Disallow kernel profiling for unpriv",
+                "You may not have permission to collect %sstats.\n\n"
+                "Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n"
+                "which controls use of the performance events system by\n"
+                "unprivileged users (without CAP_SYS_ADMIN).\n\n"
+                "The default value is 1:\n\n"
+                "  -1: Allow use of (almost) all events by all users\n"
+                ">= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK\n"
+                ">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n"
+                ">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN",
                                  target->system_wide ? "system-wide " : "");
         case ENOENT:
                 return scnprintf(msg, size, "The %s event is not supported.",
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h

index 8e75434bd01c671a8ed2e0c0b03a139212d7001c..501ea6e565f13a4a4817947957c79f15d805d130 100644 (file)
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -93,10 +93,8 @@ struct perf_evsel {
         const char              *unit;
         struct event_format     *tp_format;
         off_t                   id_offset;
-       union {
-               void            *priv;
-               u64             db_id;
-       };
+       void                    *priv;
+       u64                     db_id;
         struct cgroup_sel       *cgrp;
         void                    *handler;
         struct cpu_map          *cpus;
@@ -364,6 +362,14 @@ static inline bool perf_evsel__is_function_event(struct perf_evsel *evsel)
  #undef FUNCTION_EVENT
  }
  
+static inline bool perf_evsel__is_bpf_output(struct perf_evsel *evsel)
+{
+       struct perf_event_attr *attr = &evsel->attr;
+
+       return (attr->config == PERF_COUNT_SW_BPF_OUTPUT) &&
+               (attr->type == PERF_TYPE_SOFTWARE);
+}
+
  struct perf_attr_details {
         bool freq;
         bool verbose;
diff --git a/tools/perf/util/genelf.c b/tools/perf/util/genelf.c

new file mode 100644 (file)

index 0000000..c1ef805
--- /dev/null
+++ b/tools/perf/util/genelf.c
@@ -0,0 +1,449 @@
+/*
+ * genelf.c
+ * Copyright (C) 2014, Google, Inc
+ *
+ * Contributed by:
+ *     Stephane Eranian <eranian@gmail.com>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <getopt.h>
+#include <stddef.h>
+#include <libelf.h>
+#include <string.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <err.h>
+#include <dwarf.h>
+
+#include "perf.h"
+#include "genelf.h"
+#include "../util/jitdump.h"
+
+#define JVMTI
+
+#define BUILD_ID_URANDOM /* different uuid for each run */
+
+#ifdef HAVE_LIBCRYPTO
+
+#define BUILD_ID_MD5
+#undef BUILD_ID_SHA    /* does not seem to work well when linked with Java */
+#undef BUILD_ID_URANDOM /* different uuid for each run */
+
+#ifdef BUILD_ID_SHA
+#include <openssl/sha.h>
+#endif
+
+#ifdef BUILD_ID_MD5
+#include <openssl/md5.h>
+#endif
+#endif
+
+
+typedef struct {
+  unsigned int namesz;  /* Size of entry's owner string */
+  unsigned int descsz;  /* Size of the note descriptor */
+  unsigned int type;    /* Interpretation of the descriptor */
+  char         name[0]; /* Start of the name+desc data */
+} Elf_Note;
+
+struct options {
+       char *output;
+       int fd;
+};
+
+static char shd_string_table[] = {
+       0,
+       '.', 't', 'e', 'x', 't', 0,                     /*  1 */
+       '.', 's', 'h', 's', 't', 'r', 't', 'a', 'b', 0, /*  7 */
+       '.', 's', 'y', 'm', 't', 'a', 'b', 0,           /* 17 */
+       '.', 's', 't', 'r', 't', 'a', 'b', 0,           /* 25 */
+       '.', 'n', 'o', 't', 'e', '.', 'g', 'n', 'u', '.', 'b', 'u', 'i', 'l', 'd', '-', 'i', 'd', 0, /* 33 */
+       '.', 'd', 'e', 'b', 'u', 'g', '_', 'l', 'i', 'n', 'e', 0, /* 52 */
+       '.', 'd', 'e', 'b', 'u', 'g', '_', 'i', 'n', 'f', 'o', 0, /* 64 */
+       '.', 'd', 'e', 'b', 'u', 'g', '_', 'a', 'b', 'b', 'r', 'e', 'v', 0, /* 76 */
+};
+
+static struct buildid_note {
+       Elf_Note desc;          /* descsz: size of build-id, must be multiple of 4 */
+       char     name[4];       /* GNU\0 */
+       char     build_id[20];
+} bnote;
+
+static Elf_Sym symtab[]={
+       /* symbol 0 MUST be the undefined symbol */
+       { .st_name  = 0, /* index in sym_string table */
+         .st_info  = ELF_ST_TYPE(STT_NOTYPE),
+         .st_shndx = 0, /* for now */
+         .st_value = 0x0,
+         .st_other = ELF_ST_VIS(STV_DEFAULT),
+         .st_size  = 0,
+       },
+       { .st_name  = 1, /* index in sym_string table */
+         .st_info  = ELF_ST_BIND(STB_LOCAL) | ELF_ST_TYPE(STT_FUNC),
+         .st_shndx = 1,
+         .st_value = 0, /* for now */
+         .st_other = ELF_ST_VIS(STV_DEFAULT),
+         .st_size  = 0, /* for now */
+       }
+};
+
+#ifdef BUILD_ID_URANDOM
+static void
+gen_build_id(struct buildid_note *note,
+            unsigned long load_addr __maybe_unused,
+            const void *code __maybe_unused,
+            size_t csize __maybe_unused)
+{
+       int fd;
+       size_t sz = sizeof(note->build_id);
+       ssize_t sret;
+
+       fd = open("/dev/urandom", O_RDONLY);
+       if (fd == -1)
+               err(1, "cannot access /dev/urandom for builid");
+
+       sret = read(fd, note->build_id, sz);
+
+       close(fd);
+
+       if (sret != (ssize_t)sz)
+               memset(note->build_id, 0, sz);
+}
+#endif
+
+#ifdef BUILD_ID_SHA
+static void
+gen_build_id(struct buildid_note *note,
+            unsigned long load_addr __maybe_unused,
+            const void *code,
+            size_t csize)
+{
+       if (sizeof(note->build_id) < SHA_DIGEST_LENGTH)
+               errx(1, "build_id too small for SHA1");
+
+       SHA1(code, csize, (unsigned char *)note->build_id);
+}
+#endif
+
+#ifdef BUILD_ID_MD5
+static void
+gen_build_id(struct buildid_note *note, unsigned long load_addr, const void *code, size_t csize)
+{
+       MD5_CTX context;
+
+       if (sizeof(note->build_id) < 16)
+               errx(1, "build_id too small for MD5");
+
+       MD5_Init(&context);
+       MD5_Update(&context, &load_addr, sizeof(load_addr));
+       MD5_Update(&context, code, csize);
+       MD5_Final((unsigned char *)note->build_id, &context);
+}
+#endif
+
+/*
+ * fd: file descriptor open for writing for the output file
+ * load_addr: code load address (could be zero, just used for buildid)
+ * sym: function name (for native code - used as the symbol)
+ * code: the native code
+ * csize: the code size in bytes
+ */
+int
+jit_write_elf(int fd, uint64_t load_addr, const char *sym,
+             const void *code, int csize,
+             void *debug, int nr_debug_entries)
+{
+       Elf *e;
+       Elf_Data *d;
+       Elf_Scn *scn;
+       Elf_Ehdr *ehdr;
+       Elf_Shdr *shdr;
+       char *strsym = NULL;
+       int symlen;
+       int retval = -1;
+
+       if (elf_version(EV_CURRENT) == EV_NONE) {
+               warnx("ELF initialization failed");
+               return -1;
+       }
+
+       e = elf_begin(fd, ELF_C_WRITE, NULL);
+       if (!e) {
+               warnx("elf_begin failed");
+               goto error;
+       }
+
+       /*
+        * setup ELF header
+        */
+       ehdr = elf_newehdr(e);
+       if (!ehdr) {
+               warnx("cannot get ehdr");
+               goto error;
+       }
+
+       ehdr->e_ident[EI_DATA] = GEN_ELF_ENDIAN;
+       ehdr->e_ident[EI_CLASS] = GEN_ELF_CLASS;
+       ehdr->e_machine = GEN_ELF_ARCH;
+       ehdr->e_type = ET_DYN;
+       ehdr->e_entry = GEN_ELF_TEXT_OFFSET;
+       ehdr->e_version = EV_CURRENT;
+       ehdr->e_shstrndx= 2; /* shdr index for section name */
+
+       /*
+        * setup text section
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               goto error;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               goto error;
+       }
+
+       d->d_align = 16;
+       d->d_off = 0LL;
+       d->d_buf = (void *)code;
+       d->d_type = ELF_T_BYTE;
+       d->d_size = csize;
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               goto error;
+       }
+
+       shdr->sh_name = 1;
+       shdr->sh_type = SHT_PROGBITS;
+       shdr->sh_addr = GEN_ELF_TEXT_OFFSET;
+       shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
+       shdr->sh_entsize = 0;
+
+       /*
+        * setup section headers string table
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               goto error;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               goto error;
+       }
+
+       d->d_align = 1;
+       d->d_off = 0LL;
+       d->d_buf = shd_string_table;
+       d->d_type = ELF_T_BYTE;
+       d->d_size = sizeof(shd_string_table);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               goto error;
+       }
+
+       shdr->sh_name = 7; /* offset of '.shstrtab' in shd_string_table */
+       shdr->sh_type = SHT_STRTAB;
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = 0;
+
+       /*
+        * setup symtab section
+        */
+       symtab[1].st_size  = csize;
+       symtab[1].st_value = GEN_ELF_TEXT_OFFSET;
+
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               goto error;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               goto error;
+       }
+
+       d->d_align = 8;
+       d->d_off = 0LL;
+       d->d_buf = symtab;
+       d->d_type = ELF_T_SYM;
+       d->d_size = sizeof(symtab);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               goto error;
+       }
+
+       shdr->sh_name = 17; /* offset of '.symtab' in shd_string_table */
+       shdr->sh_type = SHT_SYMTAB;
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = sizeof(Elf_Sym);
+       shdr->sh_link = 4; /* index of .strtab section */
+
+       /*
+        * setup symbols string table
+        * 2 = 1 for 0 in 1st entry, 1 for the 0 at end of symbol for 2nd entry
+        */
+       symlen = 2 + strlen(sym);
+       strsym = calloc(1, symlen);
+       if (!strsym) {
+               warnx("cannot allocate strsym");
+               goto error;
+       }
+       strcpy(strsym + 1, sym);
+
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               goto error;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               goto error;
+       }
+
+       d->d_align = 1;
+       d->d_off = 0LL;
+       d->d_buf = strsym;
+       d->d_type = ELF_T_BYTE;
+       d->d_size = symlen;
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               goto error;
+       }
+
+       shdr->sh_name = 25; /* offset in shd_string_table */
+       shdr->sh_type = SHT_STRTAB;
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = 0;
+
+       /*
+        * setup build-id section
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               goto error;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               goto error;
+       }
+
+       /*
+        * build-id generation
+        */
+       gen_build_id(&bnote, load_addr, code, csize);
+       bnote.desc.namesz = sizeof(bnote.name); /* must include 0 termination */
+       bnote.desc.descsz = sizeof(bnote.build_id);
+       bnote.desc.type   = NT_GNU_BUILD_ID;
+       strcpy(bnote.name, "GNU");
+
+       d->d_align = 4;
+       d->d_off = 0LL;
+       d->d_buf = &bnote;
+       d->d_type = ELF_T_BYTE;
+       d->d_size = sizeof(bnote);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               goto error;
+       }
+
+       shdr->sh_name = 33; /* offset in shd_string_table */
+       shdr->sh_type = SHT_NOTE;
+       shdr->sh_addr = 0x0;
+       shdr->sh_flags = SHF_ALLOC;
+       shdr->sh_size = sizeof(bnote);
+       shdr->sh_entsize = 0;
+
+       if (debug && nr_debug_entries) {
+               retval = jit_add_debug_info(e, load_addr, debug, nr_debug_entries);
+               if (retval)
+                       goto error;
+       } else {
+               if (elf_update(e, ELF_C_WRITE) < 0) {
+                       warnx("elf_update 4 failed");
+                       goto error;
+               }
+       }
+
+       retval = 0;
+error:
+       (void)elf_end(e);
+
+       free(strsym);
+
+
+       return retval;
+}
+
+#ifndef JVMTI
+
+static unsigned char x86_code[] = {
+    0xBB, 0x2A, 0x00, 0x00, 0x00, /* movl $42, %ebx */
+    0xB8, 0x01, 0x00, 0x00, 0x00, /* movl $1, %eax */
+    0xCD, 0x80            /* int $0x80 */
+};
+
+static struct options options;
+
+int main(int argc, char **argv)
+{
+       int c, fd, ret;
+
+       while ((c = getopt(argc, argv, "o:h")) != -1) {
+               switch (c) {
+               case 'o':
+                       options.output = optarg;
+                       break;
+               case 'h':
+                       printf("Usage: genelf -o output_file [-h]\n");
+                       return 0;
+               default:
+                       errx(1, "unknown option");
+               }
+       }
+
+       fd = open(options.output, O_CREAT|O_TRUNC|O_RDWR, 0666);
+       if (fd == -1)
+               err(1, "cannot create file %s", options.output);
+
+       ret = jit_write_elf(fd, "main", x86_code, sizeof(x86_code));
+       close(fd);
+
+       if (ret != 0)
+               unlink(options.output);
+
+       return ret;
+}
+#endif
diff --git a/tools/perf/util/genelf.h b/tools/perf/util/genelf.h

new file mode 100644 (file)

index 0000000..45bf9c6
--- /dev/null
+++ b/tools/perf/util/genelf.h
@@ -0,0 +1,67 @@
+#ifndef __GENELF_H__
+#define __GENELF_H__
+
+/* genelf.c */
+extern int jit_write_elf(int fd, uint64_t code_addr, const char *sym,
+                        const void *code, int csize,
+                        void *debug, int nr_debug_entries);
+/* genelf_debug.c */
+extern int jit_add_debug_info(Elf *e, uint64_t code_addr,
+                             void *debug, int nr_debug_entries);
+
+#if   defined(__arm__)
+#define GEN_ELF_ARCH   EM_ARM
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS32
+#elif defined(__aarch64__)
+#define GEN_ELF_ARCH   EM_AARCH64
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS64
+#elif defined(__x86_64__)
+#define GEN_ELF_ARCH   EM_X86_64
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS64
+#elif defined(__i386__)
+#define GEN_ELF_ARCH   EM_386
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS32
+#elif defined(__ppcle__)
+#define GEN_ELF_ARCH   EM_PPC
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS64
+#elif defined(__powerpc__)
+#define GEN_ELF_ARCH   EM_PPC64
+#define GEN_ELF_ENDIAN ELFDATA2MSB
+#define GEN_ELF_CLASS  ELFCLASS64
+#elif defined(__powerpcle__)
+#define GEN_ELF_ARCH   EM_PPC64
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS64
+#else
+#error "unsupported architecture"
+#endif
+
+#if GEN_ELF_CLASS == ELFCLASS64
+#define elf_newehdr    elf64_newehdr
+#define elf_getshdr    elf64_getshdr
+#define Elf_Ehdr       Elf64_Ehdr
+#define Elf_Shdr       Elf64_Shdr
+#define Elf_Sym                Elf64_Sym
+#define ELF_ST_TYPE(a) ELF64_ST_TYPE(a)
+#define ELF_ST_BIND(a) ELF64_ST_BIND(a)
+#define ELF_ST_VIS(a)  ELF64_ST_VISIBILITY(a)
+#else
+#define elf_newehdr    elf32_newehdr
+#define elf_getshdr    elf32_getshdr
+#define Elf_Ehdr       Elf32_Ehdr
+#define Elf_Shdr       Elf32_Shdr
+#define Elf_Sym                Elf32_Sym
+#define ELF_ST_TYPE(a) ELF32_ST_TYPE(a)
+#define ELF_ST_BIND(a) ELF32_ST_BIND(a)
+#define ELF_ST_VIS(a)  ELF32_ST_VISIBILITY(a)
+#endif
+
+/* The .text section is directly after the ELF header */
+#define GEN_ELF_TEXT_OFFSET sizeof(Elf_Ehdr)
+
+#endif
diff --git a/tools/perf/util/genelf_debug.c b/tools/perf/util/genelf_debug.c

new file mode 100644 (file)

index 0000000..5980f7d
--- /dev/null
+++ b/tools/perf/util/genelf_debug.c
@@ -0,0 +1,610 @@
+/*
+ * genelf_debug.c
+ * Copyright (C) 2015, Google, Inc
+ *
+ * Contributed by:
+ *     Stephane Eranian <eranian@google.com>
+ *
+ * Released under the GPL v2.
+ *
+ * based on GPLv2 source code from Oprofile
+ * @remark Copyright 2007 OProfile authors
+ * @author Philippe Elie
+ */
+#include <sys/types.h>
+#include <stdio.h>
+#include <getopt.h>
+#include <stddef.h>
+#include <libelf.h>
+#include <string.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <err.h>
+#include <dwarf.h>
+
+#include "perf.h"
+#include "genelf.h"
+#include "../util/jitdump.h"
+
+#define BUFFER_EXT_DFL_SIZE    (4 * 1024)
+
+typedef uint32_t uword;
+typedef uint16_t uhalf;
+typedef int32_t  sword;
+typedef int16_t  shalf;
+typedef uint8_t  ubyte;
+typedef int8_t   sbyte;
+
+struct buffer_ext {
+       size_t cur_pos;
+       size_t max_sz;
+       void *data;
+};
+
+static void
+buffer_ext_dump(struct buffer_ext *be, const char *msg)
+{
+       size_t i;
+       warnx("DUMP for %s", msg);
+       for (i = 0 ; i < be->cur_pos; i++)
+               warnx("%4zu 0x%02x", i, (((char *)be->data)[i]) & 0xff);
+}
+
+static inline int
+buffer_ext_add(struct buffer_ext *be, void *addr, size_t sz)
+{
+       void *tmp;
+       size_t be_sz = be->max_sz;
+
+retry:
+       if ((be->cur_pos + sz) < be_sz) {
+               memcpy(be->data + be->cur_pos, addr, sz);
+               be->cur_pos += sz;
+               return 0;
+       }
+
+       if (!be_sz)
+               be_sz = BUFFER_EXT_DFL_SIZE;
+       else
+               be_sz <<= 1;
+
+       tmp = realloc(be->data, be_sz);
+       if (!tmp)
+               return -1;
+
+       be->data   = tmp;
+       be->max_sz = be_sz;
+
+       goto retry;
+}
+
+static void
+buffer_ext_init(struct buffer_ext *be)
+{
+       be->data = NULL;
+       be->cur_pos = 0;
+       be->max_sz = 0;
+}
+
+static inline size_t
+buffer_ext_size(struct buffer_ext *be)
+{
+       return be->cur_pos;
+}
+
+static inline void *
+buffer_ext_addr(struct buffer_ext *be)
+{
+       return be->data;
+}
+
+struct debug_line_header {
+       // Not counting this field
+       uword total_length;
+       // version number (2 currently)
+       uhalf version;
+       // relative offset from next field to
+       // program statement
+       uword prolog_length;
+       ubyte minimum_instruction_length;
+       ubyte default_is_stmt;
+       // line_base - see DWARF 2 specs
+       sbyte line_base;
+       // line_range - see DWARF 2 specs
+       ubyte line_range;
+       // number of opcode + 1
+       ubyte opcode_base;
+       /* follow the array of opcode args nr: ubytes [nr_opcode_base] */
+       /* follow the search directories index, zero terminated string
+        * terminated by an empty string.
+        */
+       /* follow an array of { filename, LEB128, LEB128, LEB128 }, first is
+        * the directory index entry, 0 means current directory, then mtime
+        * and filesize, last entry is followed by en empty string.
+        */
+       /* follow the first program statement */
+} __attribute__((packed));
+
+/* DWARF 2 spec talk only about one possible compilation unit header while
+ * binutils can handle two flavours of dwarf 2, 32 and 64 bits, this is not
+ * related to the used arch, an ELF 32 can hold more than 4 Go of debug
+ * information. For now we handle only DWARF 2 32 bits comp unit. It'll only
+ * become a problem if we generate more than 4GB of debug information.
+ */
+struct compilation_unit_header {
+       uword total_length;
+       uhalf version;
+       uword debug_abbrev_offset;
+       ubyte pointer_size;
+} __attribute__((packed));
+
+#define DW_LNS_num_opcode (DW_LNS_set_isa + 1)
+
+/* field filled at run time are marked with -1 */
+static struct debug_line_header const default_debug_line_header = {
+       .total_length = -1,
+       .version = 2,
+       .prolog_length = -1,
+       .minimum_instruction_length = 1,        /* could be better when min instruction size != 1 */
+       .default_is_stmt = 1,   /* we don't take care about basic block */
+       .line_base = -5,        /* sensible value for line base ... */
+       .line_range = -14,     /* ... and line range are guessed statically */
+       .opcode_base = DW_LNS_num_opcode
+};
+
+static ubyte standard_opcode_length[] =
+{
+       0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1
+};
+#if 0
+{
+       [DW_LNS_advance_pc]   = 1,
+       [DW_LNS_advance_line] = 1,
+       [DW_LNS_set_file] =  1,
+       [DW_LNS_set_column] = 1,
+       [DW_LNS_fixed_advance_pc] = 1,
+       [DW_LNS_set_isa] = 1,
+};
+#endif
+
+/* field filled at run time are marked with -1 */
+static struct compilation_unit_header default_comp_unit_header = {
+       .total_length = -1,
+       .version = 2,
+       .debug_abbrev_offset = 0,     /* we reuse the same abbrev entries for all comp unit */
+       .pointer_size = sizeof(void *)
+};
+
+static void emit_uword(struct buffer_ext *be, uword data)
+{
+       buffer_ext_add(be, &data, sizeof(uword));
+}
+
+static void emit_string(struct buffer_ext *be, const char *s)
+{
+       buffer_ext_add(be, (void *)s, strlen(s) + 1);
+}
+
+static void emit_unsigned_LEB128(struct buffer_ext *be,
+                                unsigned long data)
+{
+       do {
+               ubyte cur = data & 0x7F;
+               data >>= 7;
+               if (data)
+                       cur |= 0x80;
+               buffer_ext_add(be, &cur, 1);
+       } while (data);
+}
+
+static void emit_signed_LEB128(struct buffer_ext *be, long data)
+{
+       int more = 1;
+       int negative = data < 0;
+       int size = sizeof(long) * CHAR_BIT;
+       while (more) {
+               ubyte cur = data & 0x7F;
+               data >>= 7;
+               if (negative)
+                       data |= - (1 << (size - 7));
+               if ((data == 0 && !(cur & 0x40)) ||
+                   (data == -1l && (cur & 0x40)))
+                       more = 0;
+               else
+                       cur |= 0x80;
+               buffer_ext_add(be, &cur, 1);
+       }
+}
+
+static void emit_extended_opcode(struct buffer_ext *be, ubyte opcode,
+                                void *data, size_t data_len)
+{
+       buffer_ext_add(be, (char *)"", 1);
+
+       emit_unsigned_LEB128(be, data_len + 1);
+
+       buffer_ext_add(be, &opcode, 1);
+       buffer_ext_add(be, data, data_len);
+}
+
+static void emit_opcode(struct buffer_ext *be, ubyte opcode)
+{
+       buffer_ext_add(be, &opcode, 1);
+}
+
+static void emit_opcode_signed(struct buffer_ext  *be,
+                              ubyte opcode, long data)
+{
+       buffer_ext_add(be, &opcode, 1);
+       emit_signed_LEB128(be, data);
+}
+
+static void emit_opcode_unsigned(struct buffer_ext *be, ubyte opcode,
+                                unsigned long data)
+{
+       buffer_ext_add(be, &opcode, 1);
+       emit_unsigned_LEB128(be, data);
+}
+
+static void emit_advance_pc(struct buffer_ext *be, unsigned long delta_pc)
+{
+       emit_opcode_unsigned(be, DW_LNS_advance_pc, delta_pc);
+}
+
+static void emit_advance_lineno(struct buffer_ext  *be, long delta_lineno)
+{
+       emit_opcode_signed(be, DW_LNS_advance_line, delta_lineno);
+}
+
+static void emit_lne_end_of_sequence(struct buffer_ext *be)
+{
+       emit_extended_opcode(be, DW_LNE_end_sequence, NULL, 0);
+}
+
+static void emit_set_file(struct buffer_ext *be, unsigned long idx)
+{
+       emit_opcode_unsigned(be, DW_LNS_set_file, idx);
+}
+
+static void emit_lne_define_filename(struct buffer_ext *be,
+                                    const char *filename)
+{
+       buffer_ext_add(be, (void *)"", 1);
+
+       /* LNE field, strlen(filename) + zero termination, 3 bytes for: the dir entry, timestamp, filesize */
+       emit_unsigned_LEB128(be, strlen(filename) + 5);
+       emit_opcode(be, DW_LNE_define_file);
+       emit_string(be, filename);
+       /* directory index 0=do not know */
+        emit_unsigned_LEB128(be, 0);
+       /* last modification date on file 0=do not know */
+        emit_unsigned_LEB128(be, 0);
+       /* filesize 0=do not know */
+        emit_unsigned_LEB128(be, 0);
+}
+
+static void emit_lne_set_address(struct buffer_ext *be,
+                                void *address)
+{
+       emit_extended_opcode(be, DW_LNE_set_address, &address, sizeof(unsigned long));
+}
+
+static ubyte get_special_opcode(struct debug_entry *ent,
+                               unsigned int last_line,
+                               unsigned long last_vma)
+{
+       unsigned int temp;
+       unsigned long delta_addr;
+
+       /*
+        * delta from line_base
+        */
+       temp = (ent->lineno - last_line) - default_debug_line_header.line_base;
+
+       if (temp >= default_debug_line_header.line_range)
+               return 0;
+
+       /*
+        * delta of addresses
+        */
+       delta_addr = (ent->addr - last_vma) / default_debug_line_header.minimum_instruction_length;
+
+       /* This is not sufficient to ensure opcode will be in [0-256] but
+        * sufficient to ensure when summing with the delta lineno we will
+        * not overflow the unsigned long opcode */
+
+       if (delta_addr <= 256 / default_debug_line_header.line_range) {
+               unsigned long opcode = temp +
+                       (delta_addr * default_debug_line_header.line_range) +
+                       default_debug_line_header.opcode_base;
+
+               return opcode <= 255 ? opcode : 0;
+       }
+       return 0;
+}
+
+static void emit_lineno_info(struct buffer_ext *be,
+                            struct debug_entry *ent, size_t nr_entry,
+                            unsigned long code_addr)
+{
+       size_t i;
+
+       /*
+        * Machine state at start of a statement program
+        * address = 0
+        * file    = 1
+        * line    = 1
+        * column  = 0
+        * is_stmt = default_is_stmt as given in the debug_line_header
+        * basic block = 0
+        * end sequence = 0
+        */
+
+       /* start state of the state machine we take care of */
+       unsigned long last_vma = code_addr;
+       char const  *cur_filename = NULL;
+       unsigned long cur_file_idx = 0;
+       int last_line = 1;
+
+       emit_lne_set_address(be, (void *)code_addr);
+
+       for (i = 0; i < nr_entry; i++, ent = debug_entry_next(ent)) {
+               int need_copy = 0;
+               ubyte special_opcode;
+
+               /*
+                * check if filename changed, if so add it
+                */
+               if (!cur_filename || strcmp(cur_filename, ent->name)) {
+                       emit_lne_define_filename(be, ent->name);
+                       cur_filename = ent->name;
+                       emit_set_file(be, ++cur_file_idx);
+                       need_copy = 1;
+               }
+
+               special_opcode = get_special_opcode(ent, last_line, last_vma);
+               if (special_opcode != 0) {
+                       last_line = ent->lineno;
+                       last_vma  = ent->addr;
+                       emit_opcode(be, special_opcode);
+               } else {
+                       /*
+                        * lines differ, emit line delta
+                        */
+                       if (last_line != ent->lineno) {
+                               emit_advance_lineno(be, ent->lineno - last_line);
+                               last_line = ent->lineno;
+                               need_copy = 1;
+                       }
+                       /*
+                        * addresses differ, emit address delta
+                        */
+                       if (last_vma != ent->addr) {
+                               emit_advance_pc(be, ent->addr - last_vma);
+                               last_vma = ent->addr;
+                               need_copy = 1;
+                       }
+                       /*
+                        * add new row to matrix
+                        */
+                       if (need_copy)
+                               emit_opcode(be, DW_LNS_copy);
+               }
+       }
+}
+
+static void add_debug_line(struct buffer_ext *be,
+       struct debug_entry *ent, size_t nr_entry,
+       unsigned long code_addr)
+{
+       struct debug_line_header * dbg_header;
+       size_t old_size;
+
+       old_size = buffer_ext_size(be);
+
+       buffer_ext_add(be, (void *)&default_debug_line_header,
+                sizeof(default_debug_line_header));
+
+       buffer_ext_add(be, &standard_opcode_length,  sizeof(standard_opcode_length));
+
+       // empty directory entry
+       buffer_ext_add(be, (void *)"", 1);
+
+       // empty filename directory
+       buffer_ext_add(be, (void *)"", 1);
+
+       dbg_header = buffer_ext_addr(be) + old_size;
+       dbg_header->prolog_length = (buffer_ext_size(be) - old_size) -
+               offsetof(struct debug_line_header, minimum_instruction_length);
+
+       emit_lineno_info(be, ent, nr_entry, code_addr);
+
+       emit_lne_end_of_sequence(be);
+
+       dbg_header = buffer_ext_addr(be) + old_size;
+       dbg_header->total_length = (buffer_ext_size(be) - old_size) -
+               offsetof(struct debug_line_header, version);
+}
+
+static void
+add_debug_abbrev(struct buffer_ext *be)
+{
+        emit_unsigned_LEB128(be, 1);
+        emit_unsigned_LEB128(be, DW_TAG_compile_unit);
+        emit_unsigned_LEB128(be, DW_CHILDREN_yes);
+        emit_unsigned_LEB128(be, DW_AT_stmt_list);
+        emit_unsigned_LEB128(be, DW_FORM_data4);
+        emit_unsigned_LEB128(be, 0);
+        emit_unsigned_LEB128(be, 0);
+        emit_unsigned_LEB128(be, 0);
+}
+
+static void
+add_compilation_unit(struct buffer_ext *be,
+                    size_t offset_debug_line)
+{
+       struct compilation_unit_header *comp_unit_header;
+       size_t old_size = buffer_ext_size(be);
+
+       buffer_ext_add(be, &default_comp_unit_header,
+                      sizeof(default_comp_unit_header));
+
+       emit_unsigned_LEB128(be, 1);
+       emit_uword(be, offset_debug_line);
+
+       comp_unit_header = buffer_ext_addr(be) + old_size;
+       comp_unit_header->total_length = (buffer_ext_size(be) - old_size) -
+               offsetof(struct compilation_unit_header, version);
+}
+
+static int
+jit_process_debug_info(uint64_t code_addr,
+                      void *debug, int nr_debug_entries,
+                      struct buffer_ext *dl,
+                      struct buffer_ext *da,
+                      struct buffer_ext *di)
+{
+       struct debug_entry *ent = debug;
+       int i;
+
+       for (i = 0; i < nr_debug_entries; i++) {
+               ent->addr = ent->addr - code_addr;
+               ent = debug_entry_next(ent);
+       }
+       add_compilation_unit(di, buffer_ext_size(dl));
+       add_debug_line(dl, debug, nr_debug_entries, 0);
+       add_debug_abbrev(da);
+       if (0) buffer_ext_dump(da, "abbrev");
+
+       return 0;
+}
+
+int
+jit_add_debug_info(Elf *e, uint64_t code_addr, void *debug, int nr_debug_entries)
+{
+       Elf_Data *d;
+       Elf_Scn *scn;
+       Elf_Shdr *shdr;
+       struct buffer_ext dl, di, da;
+       int ret;
+
+       buffer_ext_init(&dl);
+       buffer_ext_init(&di);
+       buffer_ext_init(&da);
+
+       ret = jit_process_debug_info(code_addr, debug, nr_debug_entries, &dl, &da, &di);
+       if (ret)
+               return -1;
+       /*
+        * setup .debug_line section
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               return -1;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               return -1;
+       }
+
+       d->d_align = 1;
+       d->d_off = 0LL;
+       d->d_buf = buffer_ext_addr(&dl);
+       d->d_type = ELF_T_BYTE;
+       d->d_size = buffer_ext_size(&dl);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               return -1;
+       }
+
+       shdr->sh_name = 52; /* .debug_line */
+       shdr->sh_type = SHT_PROGBITS;
+       shdr->sh_addr = 0; /* must be zero or == sh_offset -> dynamic object */
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = 0;
+
+       /*
+        * setup .debug_info section
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               return -1;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               return -1;
+       }
+
+       d->d_align = 1;
+       d->d_off = 0LL;
+       d->d_buf = buffer_ext_addr(&di);
+       d->d_type = ELF_T_BYTE;
+       d->d_size = buffer_ext_size(&di);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               return -1;
+       }
+
+       shdr->sh_name = 64; /* .debug_info */
+       shdr->sh_type = SHT_PROGBITS;
+       shdr->sh_addr = 0; /* must be zero or == sh_offset -> dynamic object */
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = 0;
+
+       /*
+        * setup .debug_abbrev section
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               return -1;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               return -1;
+       }
+
+       d->d_align = 1;
+       d->d_off = 0LL;
+       d->d_buf = buffer_ext_addr(&da);
+       d->d_type = ELF_T_BYTE;
+       d->d_size = buffer_ext_size(&da);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               return -1;
+       }
+
+       shdr->sh_name = 76; /* .debug_info */
+       shdr->sh_type = SHT_PROGBITS;
+       shdr->sh_addr = 0; /* must be zero or == sh_offset -> dynamic object */
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = 0;
+
+       /*
+        * now we update the ELF image with all the sections
+        */
+       if (elf_update(e, ELF_C_WRITE) < 0) {
+               warnx("elf_update debug failed");
+               return -1;
+       }
+       return 0;
+}
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c

index f50b7235ecb6558d167a475bf4199e8b05f52143..73e38e472ecd7771695b7ac2d91829a35529f419 100644 (file)
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -23,6 +23,8 @@
  #include "strbuf.h"
  #include "build-id.h"
  #include "data.h"
+#include <api/fs/fs.h>
+#include "asm/bug.h"
  
  /*
   * magic2 = "PERFILE2"
@@ -868,6 +870,199 @@ static int write_auxtrace(int fd, struct perf_header *h,
         return err;
  }
  
+static int cpu_cache_level__sort(const void *a, const void *b)
+{
+       struct cpu_cache_level *cache_a = (struct cpu_cache_level *)a;
+       struct cpu_cache_level *cache_b = (struct cpu_cache_level *)b;
+
+       return cache_a->level - cache_b->level;
+}
+
+static bool cpu_cache_level__cmp(struct cpu_cache_level *a, struct cpu_cache_level *b)
+{
+       if (a->level != b->level)
+               return false;
+
+       if (a->line_size != b->line_size)
+               return false;
+
+       if (a->sets != b->sets)
+               return false;
+
+       if (a->ways != b->ways)
+               return false;
+
+       if (strcmp(a->type, b->type))
+               return false;
+
+       if (strcmp(a->size, b->size))
+               return false;
+
+       if (strcmp(a->map, b->map))
+               return false;
+
+       return true;
+}
+
+static int cpu_cache_level__read(struct cpu_cache_level *cache, u32 cpu, u16 level)
+{
+       char path[PATH_MAX], file[PATH_MAX];
+       struct stat st;
+       size_t len;
+
+       scnprintf(path, PATH_MAX, "devices/system/cpu/cpu%d/cache/index%d/", cpu, level);
+       scnprintf(file, PATH_MAX, "%s/%s", sysfs__mountpoint(), path);
+
+       if (stat(file, &st))
+               return 1;
+
+       scnprintf(file, PATH_MAX, "%s/level", path);
+       if (sysfs__read_int(file, (int *) &cache->level))
+               return -1;
+
+       scnprintf(file, PATH_MAX, "%s/coherency_line_size", path);
+       if (sysfs__read_int(file, (int *) &cache->line_size))
+               return -1;
+
+       scnprintf(file, PATH_MAX, "%s/number_of_sets", path);
+       if (sysfs__read_int(file, (int *) &cache->sets))
+               return -1;
+
+       scnprintf(file, PATH_MAX, "%s/ways_of_associativity", path);
+       if (sysfs__read_int(file, (int *) &cache->ways))
+               return -1;
+
+       scnprintf(file, PATH_MAX, "%s/type", path);
+       if (sysfs__read_str(file, &cache->type, &len))
+               return -1;
+
+       cache->type[len] = 0;
+       cache->type = rtrim(cache->type);
+
+       scnprintf(file, PATH_MAX, "%s/size", path);
+       if (sysfs__read_str(file, &cache->size, &len)) {
+               free(cache->type);
+               return -1;
+       }
+
+       cache->size[len] = 0;
+       cache->size = rtrim(cache->size);
+
+       scnprintf(file, PATH_MAX, "%s/shared_cpu_list", path);
+       if (sysfs__read_str(file, &cache->map, &len)) {
+               free(cache->map);
+               free(cache->type);
+               return -1;
+       }
+
+       cache->map[len] = 0;
+       cache->map = rtrim(cache->map);
+       return 0;
+}
+
+static void cpu_cache_level__fprintf(FILE *out, struct cpu_cache_level *c)
+{
+       fprintf(out, "L%d %-15s %8s [%s]\n", c->level, c->type, c->size, c->map);
+}
+
+static int build_caches(struct cpu_cache_level caches[], u32 size, u32 *cntp)
+{
+       u32 i, cnt = 0;
+       long ncpus;
+       u32 nr, cpu;
+       u16 level;
+
+       ncpus = sysconf(_SC_NPROCESSORS_CONF);
+       if (ncpus < 0)
+               return -1;
+
+       nr = (u32)(ncpus & UINT_MAX);
+
+       for (cpu = 0; cpu < nr; cpu++) {
+               for (level = 0; level < 10; level++) {
+                       struct cpu_cache_level c;
+                       int err;
+
+                       err = cpu_cache_level__read(&c, cpu, level);
+                       if (err < 0)
+                               return err;
+
+                       if (err == 1)
+                               break;
+
+                       for (i = 0; i < cnt; i++) {
+                               if (cpu_cache_level__cmp(&c, &caches[i]))
+                                       break;
+                       }
+
+                       if (i == cnt)
+                               caches[cnt++] = c;
+                       else
+                               cpu_cache_level__free(&c);
+
+                       if (WARN_ONCE(cnt == size, "way too many cpu caches.."))
+                               goto out;
+               }
+       }
+ out:
+       *cntp = cnt;
+       return 0;
+}
+
+#define MAX_CACHES 2000
+
+static int write_cache(int fd, struct perf_header *h __maybe_unused,
+                         struct perf_evlist *evlist __maybe_unused)
+{
+       struct cpu_cache_level caches[MAX_CACHES];
+       u32 cnt = 0, i, version = 1;
+       int ret;
+
+       ret = build_caches(caches, MAX_CACHES, &cnt);
+       if (ret)
+               goto out;
+
+       qsort(&caches, cnt, sizeof(struct cpu_cache_level), cpu_cache_level__sort);
+
+       ret = do_write(fd, &version, sizeof(u32));
+       if (ret < 0)
+               goto out;
+
+       ret = do_write(fd, &cnt, sizeof(u32));
+       if (ret < 0)
+               goto out;
+
+       for (i = 0; i < cnt; i++) {
+               struct cpu_cache_level *c = &caches[i];
+
+               #define _W(v)                                   \
+                       ret = do_write(fd, &c->v, sizeof(u32)); \
+                       if (ret < 0)                            \
+                               goto out;
+
+               _W(level)
+               _W(line_size)
+               _W(sets)
+               _W(ways)
+               #undef _W
+
+               #define _W(v)                                           \
+                       ret = do_write_string(fd, (const char *) c->v); \
+                       if (ret < 0)                                    \
+                               goto out;
+
+               _W(type)
+               _W(size)
+               _W(map)
+               #undef _W
+       }
+
+out:
+       for (i = 0; i < cnt; i++)
+               cpu_cache_level__free(&caches[i]);
+       return ret;
+}
+
  static int write_stat(int fd __maybe_unused,
                       struct perf_header *h __maybe_unused,
                       struct perf_evlist *evlist __maybe_unused)
@@ -1172,6 +1367,18 @@ static void print_stat(struct perf_header *ph __maybe_unused,
         fprintf(fp, "# contains stat data\n");
  }
  
+static void print_cache(struct perf_header *ph __maybe_unused,
+                       int fd __maybe_unused, FILE *fp __maybe_unused)
+{
+       int i;
+
+       fprintf(fp, "# CPU cache info:\n");
+       for (i = 0; i < ph->env.caches_cnt; i++) {
+               fprintf(fp, "#  ");
+               cpu_cache_level__fprintf(fp, &ph->env.caches[i]);
+       }
+}
+
  static void print_pmu_mappings(struct perf_header *ph, int fd __maybe_unused,
                                FILE *fp)
  {
@@ -1920,6 +2127,68 @@ static int process_auxtrace(struct perf_file_section *section,
         return err;
  }
  
+static int process_cache(struct perf_file_section *section __maybe_unused,
+                        struct perf_header *ph __maybe_unused, int fd __maybe_unused,
+                        void *data __maybe_unused)
+{
+       struct cpu_cache_level *caches;
+       u32 cnt, i, version;
+
+       if (readn(fd, &version, sizeof(version)) != sizeof(version))
+               return -1;
+
+       if (ph->needs_swap)
+               version = bswap_32(version);
+
+       if (version != 1)
+               return -1;
+
+       if (readn(fd, &cnt, sizeof(cnt)) != sizeof(cnt))
+               return -1;
+
+       if (ph->needs_swap)
+               cnt = bswap_32(cnt);
+
+       caches = zalloc(sizeof(*caches) * cnt);
+       if (!caches)
+               return -1;
+
+       for (i = 0; i < cnt; i++) {
+               struct cpu_cache_level c;
+
+               #define _R(v)                                           \
+                       if (readn(fd, &c.v, sizeof(u32)) != sizeof(u32))\
+                               goto out_free_caches;                   \
+                       if (ph->needs_swap)                             \
+                               c.v = bswap_32(c.v);                    \
+
+               _R(level)
+               _R(line_size)
+               _R(sets)
+               _R(ways)
+               #undef _R
+
+               #define _R(v)                           \
+                       c.v = do_read_string(fd, ph);   \
+                       if (!c.v)                       \
+                               goto out_free_caches;
+
+               _R(type)
+               _R(size)
+               _R(map)
+               #undef _R
+
+               caches[i] = c;
+       }
+
+       ph->env.caches = caches;
+       ph->env.caches_cnt = cnt;
+       return 0;
+out_free_caches:
+       free(caches);
+       return -1;
+}
+
  struct feature_ops {
         int (*write)(int fd, struct perf_header *h, struct perf_evlist *evlist);
         void (*print)(struct perf_header *h, int fd, FILE *fp);
@@ -1962,6 +2231,7 @@ static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = {
         FEAT_OPP(HEADER_GROUP_DESC,     group_desc),
         FEAT_OPP(HEADER_AUXTRACE,       auxtrace),
         FEAT_OPA(HEADER_STAT,           stat),
+       FEAT_OPF(HEADER_CACHE,          cache),
  };
  
  struct header_print_data {
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h

index cff9892452ee393764726a12895ad95d36f766fe..3d87ca823c0ae55591e753f6f928802e831c0e3c 100644 (file)
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -32,6 +32,7 @@ enum {
         HEADER_GROUP_DESC,
         HEADER_AUXTRACE,
         HEADER_STAT,
+       HEADER_CACHE,
         HEADER_LAST_FEATURE,
         HEADER_FEAT_BITS        = 256,
  };
diff --git a/tools/perf/util/help-unknown-cmd.c b/tools/perf/util/help-unknown-cmd.c

index dc1e41c9b054b186c0d9349d97e9d0f467099ac0..43a98a4dc1e1e90c7079fba26929022ec0959a31 100644 (file)
--- a/tools/perf/util/help-unknown-cmd.c
+++ b/tools/perf/util/help-unknown-cmd.c
@@ -6,7 +6,8 @@
  static int autocorrect;
  static struct cmdnames aliases;
  
-static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
+static int perf_unknown_cmd_config(const char *var, const char *value,
+                                  void *cb __maybe_unused)
  {
         if (!strcmp(var, "help.autocorrect"))
                 autocorrect = perf_config_int(var,value);
@@ -14,7 +15,7 @@ static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
         if (!prefixcmp(var, "alias."))
                 add_cmdname(&aliases, var + 6, strlen(var + 6));
  
-       return perf_default_config(var, value, cb);
+       return 0;
  }
  
  static int levenshtein_compare(const void *p1, const void *p2)
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c

index 68a7612019dc3c3be315604693b68170183044d6..290b3cbf68772de883bff4fee5c2294234dbefd3 100644 (file)
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -179,6 +179,9 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
         if (h->transaction)
                 hists__new_col_len(hists, HISTC_TRANSACTION,
                                    hist_entry__transaction_len());
+
+       if (h->trace_output)
+               hists__new_col_len(hists, HISTC_TRACE, strlen(h->trace_output));
  }
  
  void hists__output_recalc_col_len(struct hists *hists, int max_rows)
@@ -245,6 +248,8 @@ static void he_stat__decay(struct he_stat *he_stat)
         /* XXX need decay for weight too? */
  }
  
+static void hists__delete_entry(struct hists *hists, struct hist_entry *he);
+
  static bool hists__decay_entry(struct hists *hists, struct hist_entry *he)
  {
         u64 prev_period = he->stat.period;
@@ -260,21 +265,45 @@ static bool hists__decay_entry(struct hists *hists, struct hist_entry *he)
  
         diff = prev_period - he->stat.period;
  
-       hists->stats.total_period -= diff;
-       if (!he->filtered)
-               hists->stats.total_non_filtered_period -= diff;
+       if (!he->depth) {
+               hists->stats.total_period -= diff;
+               if (!he->filtered)
+                       hists->stats.total_non_filtered_period -= diff;
+       }
+
+       if (!he->leaf) {
+               struct hist_entry *child;
+               struct rb_node *node = rb_first(&he->hroot_out);
+               while (node) {
+                       child = rb_entry(node, struct hist_entry, rb_node);
+                       node = rb_next(node);
+
+                       if (hists__decay_entry(hists, child))
+                               hists__delete_entry(hists, child);
+               }
+       }
  
         return he->stat.period == 0;
  }
  
  static void hists__delete_entry(struct hists *hists, struct hist_entry *he)
  {
-       rb_erase(&he->rb_node, &hists->entries);
+       struct rb_root *root_in;
+       struct rb_root *root_out;
  
-       if (sort__need_collapse)
-               rb_erase(&he->rb_node_in, &hists->entries_collapsed);
-       else
-               rb_erase(&he->rb_node_in, hists->entries_in);
+       if (he->parent_he) {
+               root_in  = &he->parent_he->hroot_in;
+               root_out = &he->parent_he->hroot_out;
+       } else {
+               if (sort__need_collapse)
+                       root_in = &hists->entries_collapsed;
+               else
+                       root_in = hists->entries_in;
+               root_out = &hists->entries;
+       }
+
+       rb_erase(&he->rb_node_in, root_in);
+       rb_erase(&he->rb_node, root_out);
  
         --hists->nr_entries;
         if (!he->filtered)
@@ -393,6 +422,9 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template,
                 }
                 INIT_LIST_HEAD(&he->pairs.node);
                 thread__get(he->thread);
+
+               if (!symbol_conf.report_hierarchy)
+                       he->leaf = true;
         }
  
         return he;
@@ -405,6 +437,16 @@ static u8 symbol__parent_filter(const struct symbol *parent)
         return 0;
  }
  
+static void hist_entry__add_callchain_period(struct hist_entry *he, u64 period)
+{
+       if (!symbol_conf.use_callchain)
+               return;
+
+       he->hists->callchain_period += period;
+       if (!he->filtered)
+               he->hists->callchain_non_filtered_period += period;
+}
+
  static struct hist_entry *hists__findnew_entry(struct hists *hists,
                                                struct hist_entry *entry,
                                                struct addr_location *al,
@@ -432,8 +474,10 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
                 cmp = hist_entry__cmp(he, entry);
  
                 if (!cmp) {
-                       if (sample_self)
+                       if (sample_self) {
                                 he_stat__add_period(&he->stat, period, weight);
+                               hist_entry__add_callchain_period(he, period);
+                       }
                         if (symbol_conf.cumulate_callchain)
                                 he_stat__add_period(he->stat_acc, period, weight);
  
@@ -466,6 +510,8 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
         if (!he)
                 return NULL;
  
+       if (sample_self)
+               hist_entry__add_callchain_period(he, period);
         hists->nr_entries++;
  
         rb_link_node(&he->rb_node_in, parent, p);
@@ -951,10 +997,15 @@ out:
  int64_t
  hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
  {
+       struct hists *hists = left->hists;
         struct perf_hpp_fmt *fmt;
         int64_t cmp = 0;
  
-       perf_hpp__for_each_sort_list(fmt) {
+       hists__for_each_sort_list(hists, fmt) {
+               if (perf_hpp__is_dynamic_entry(fmt) &&
+                   !perf_hpp__defined_dynamic_entry(fmt, hists))
+                       continue;
+
                 cmp = fmt->cmp(fmt, left, right);
                 if (cmp)
                         break;
@@ -966,10 +1017,15 @@ hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
  int64_t
  hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
  {
+       struct hists *hists = left->hists;
         struct perf_hpp_fmt *fmt;
         int64_t cmp = 0;
  
-       perf_hpp__for_each_sort_list(fmt) {
+       hists__for_each_sort_list(hists, fmt) {
+               if (perf_hpp__is_dynamic_entry(fmt) &&
+                   !perf_hpp__defined_dynamic_entry(fmt, hists))
+                       continue;
+
                 cmp = fmt->collapse(fmt, left, right);
                 if (cmp)
                         break;
@@ -1005,18 +1061,251 @@ void hist_entry__delete(struct hist_entry *he)
         free(he);
  }
  
+/*
+ * If this is not the last column, then we need to pad it according to the
+ * pre-calculated max lenght for this column, otherwise don't bother adding
+ * spaces because that would break viewing this with, for instance, 'less',
+ * that would show tons of trailing spaces when a long C++ demangled method
+ * names is sampled.
+*/
+int hist_entry__snprintf_alignment(struct hist_entry *he, struct perf_hpp *hpp,
+                                  struct perf_hpp_fmt *fmt, int printed)
+{
+       if (!list_is_last(&fmt->list, &he->hists->hpp_list->fields)) {
+               const int width = fmt->width(fmt, hpp, hists_to_evsel(he->hists));
+               if (printed < width) {
+                       advance_hpp(hpp, printed);
+                       printed = scnprintf(hpp->buf, hpp->size, "%-*s", width - printed, " ");
+               }
+       }
+
+       return printed;
+}
+
  /*
   * collapse the histogram
   */
  
-bool hists__collapse_insert_entry(struct hists *hists __maybe_unused,
-                                 struct rb_root *root, struct hist_entry *he)
+static void hists__apply_filters(struct hists *hists, struct hist_entry *he);
+static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *he,
+                                      enum hist_filter type);
+
+typedef bool (*fmt_chk_fn)(struct perf_hpp_fmt *fmt);
+
+static bool check_thread_entry(struct perf_hpp_fmt *fmt)
+{
+       return perf_hpp__is_thread_entry(fmt) || perf_hpp__is_comm_entry(fmt);
+}
+
+static void hist_entry__check_and_remove_filter(struct hist_entry *he,
+                                               enum hist_filter type,
+                                               fmt_chk_fn check)
+{
+       struct perf_hpp_fmt *fmt;
+       bool type_match = false;
+       struct hist_entry *parent = he->parent_he;
+
+       switch (type) {
+       case HIST_FILTER__THREAD:
+               if (symbol_conf.comm_list == NULL &&
+                   symbol_conf.pid_list == NULL &&
+                   symbol_conf.tid_list == NULL)
+                       return;
+               break;
+       case HIST_FILTER__DSO:
+               if (symbol_conf.dso_list == NULL)
+                       return;
+               break;
+       case HIST_FILTER__SYMBOL:
+               if (symbol_conf.sym_list == NULL)
+                       return;
+               break;
+       case HIST_FILTER__PARENT:
+       case HIST_FILTER__GUEST:
+       case HIST_FILTER__HOST:
+       case HIST_FILTER__SOCKET:
+       default:
+               return;
+       }
+
+       /* if it's filtered by own fmt, it has to have filter bits */
+       perf_hpp_list__for_each_format(he->hpp_list, fmt) {
+               if (check(fmt)) {
+                       type_match = true;
+                       break;
+               }
+       }
+
+       if (type_match) {
+               /*
+                * If the filter is for current level entry, propagate
+                * filter marker to parents.  The marker bit was
+                * already set by default so it only needs to clear
+                * non-filtered entries.
+                */
+               if (!(he->filtered & (1 << type))) {
+                       while (parent) {
+                               parent->filtered &= ~(1 << type);
+                               parent = parent->parent_he;
+                       }
+               }
+       } else {
+               /*
+                * If current entry doesn't have matching formats, set
+                * filter marker for upper level entries.  it will be
+                * cleared if its lower level entries is not filtered.
+                *
+                * For lower-level entries, it inherits parent's
+                * filter bit so that lower level entries of a
+                * non-filtered entry won't set the filter marker.
+                */
+               if (parent == NULL)
+                       he->filtered |= (1 << type);
+               else
+                       he->filtered |= (parent->filtered & (1 << type));
+       }
+}
+
+static void hist_entry__apply_hierarchy_filters(struct hist_entry *he)
+{
+       hist_entry__check_and_remove_filter(he, HIST_FILTER__THREAD,
+                                           check_thread_entry);
+
+       hist_entry__check_and_remove_filter(he, HIST_FILTER__DSO,
+                                           perf_hpp__is_dso_entry);
+
+       hist_entry__check_and_remove_filter(he, HIST_FILTER__SYMBOL,
+                                           perf_hpp__is_sym_entry);
+
+       hists__apply_filters(he->hists, he);
+}
+
+static struct hist_entry *hierarchy_insert_entry(struct hists *hists,
+                                                struct rb_root *root,
+                                                struct hist_entry *he,
+                                                struct hist_entry *parent_he,
+                                                struct perf_hpp_list *hpp_list)
+{
+       struct rb_node **p = &root->rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *iter, *new;
+       struct perf_hpp_fmt *fmt;
+       int64_t cmp;
+
+       while (*p != NULL) {
+               parent = *p;
+               iter = rb_entry(parent, struct hist_entry, rb_node_in);
+
+               cmp = 0;
+               perf_hpp_list__for_each_sort_list(hpp_list, fmt) {
+                       cmp = fmt->collapse(fmt, iter, he);
+                       if (cmp)
+                               break;
+               }
+
+               if (!cmp) {
+                       he_stat__add_stat(&iter->stat, &he->stat);
+                       return iter;
+               }
+
+               if (cmp < 0)
+                       p = &parent->rb_left;
+               else
+                       p = &parent->rb_right;
+       }
+
+       new = hist_entry__new(he, true);
+       if (new == NULL)
+               return NULL;
+
+       hists->nr_entries++;
+
+       /* save related format list for output */
+       new->hpp_list = hpp_list;
+       new->parent_he = parent_he;
+
+       hist_entry__apply_hierarchy_filters(new);
+
+       /* some fields are now passed to 'new' */
+       perf_hpp_list__for_each_sort_list(hpp_list, fmt) {
+               if (perf_hpp__is_trace_entry(fmt) || perf_hpp__is_dynamic_entry(fmt))
+                       he->trace_output = NULL;
+               else
+                       new->trace_output = NULL;
+
+               if (perf_hpp__is_srcline_entry(fmt))
+                       he->srcline = NULL;
+               else
+                       new->srcline = NULL;
+
+               if (perf_hpp__is_srcfile_entry(fmt))
+                       he->srcfile = NULL;
+               else
+                       new->srcfile = NULL;
+       }
+
+       rb_link_node(&new->rb_node_in, parent, p);
+       rb_insert_color(&new->rb_node_in, root);
+       return new;
+}
+
+static int hists__hierarchy_insert_entry(struct hists *hists,
+                                        struct rb_root *root,
+                                        struct hist_entry *he)
+{
+       struct perf_hpp_list_node *node;
+       struct hist_entry *new_he = NULL;
+       struct hist_entry *parent = NULL;
+       int depth = 0;
+       int ret = 0;
+
+       list_for_each_entry(node, &hists->hpp_formats, list) {
+               /* skip period (overhead) and elided columns */
+               if (node->level == 0 || node->skip)
+                       continue;
+
+               /* insert copy of 'he' for each fmt into the hierarchy */
+               new_he = hierarchy_insert_entry(hists, root, he, parent, &node->hpp);
+               if (new_he == NULL) {
+                       ret = -1;
+                       break;
+               }
+
+               root = &new_he->hroot_in;
+               new_he->depth = depth++;
+               parent = new_he;
+       }
+
+       if (new_he) {
+               new_he->leaf = true;
+
+               if (symbol_conf.use_callchain) {
+                       callchain_cursor_reset(&callchain_cursor);
+                       if (callchain_merge(&callchain_cursor,
+                                           new_he->callchain,
+                                           he->callchain) < 0)
+                               ret = -1;
+               }
+       }
+
+       /* 'he' is no longer used */
+       hist_entry__delete(he);
+
+       /* return 0 (or -1) since it already applied filters */
+       return ret;
+}
+
+int hists__collapse_insert_entry(struct hists *hists, struct rb_root *root,
+                                struct hist_entry *he)
  {
         struct rb_node **p = &root->rb_node;
         struct rb_node *parent = NULL;
         struct hist_entry *iter;
         int64_t cmp;
  
+       if (symbol_conf.report_hierarchy)
+               return hists__hierarchy_insert_entry(hists, root, he);
+
         while (*p != NULL) {
                 parent = *p;
                 iter = rb_entry(parent, struct hist_entry, rb_node_in);
@@ -1024,18 +1313,21 @@ bool hists__collapse_insert_entry(struct hists *hists __maybe_unused,
                 cmp = hist_entry__collapse(iter, he);
  
                 if (!cmp) {
+                       int ret = 0;
+
                         he_stat__add_stat(&iter->stat, &he->stat);
                         if (symbol_conf.cumulate_callchain)
                                 he_stat__add_stat(iter->stat_acc, he->stat_acc);
  
                         if (symbol_conf.use_callchain) {
                                 callchain_cursor_reset(&callchain_cursor);
-                               callchain_merge(&callchain_cursor,
-                                               iter->callchain,
-                                               he->callchain);
+                               if (callchain_merge(&callchain_cursor,
+                                                   iter->callchain,
+                                                   he->callchain) < 0)
+                                       ret = -1;
                         }
                         hist_entry__delete(he);
-                       return false;
+                       return ret;
                 }
  
                 if (cmp < 0)
@@ -1047,7 +1339,7 @@ bool hists__collapse_insert_entry(struct hists *hists __maybe_unused,
  
         rb_link_node(&he->rb_node_in, parent, p);
         rb_insert_color(&he->rb_node_in, root);
-       return true;
+       return 1;
  }
  
  struct rb_root *hists__get_rotate_entries_in(struct hists *hists)
@@ -1073,14 +1365,15 @@ static void hists__apply_filters(struct hists *hists, struct hist_entry *he)
         hists__filter_entry_by_socket(hists, he);
  }
  
-void hists__collapse_resort(struct hists *hists, struct ui_progress *prog)
+int hists__collapse_resort(struct hists *hists, struct ui_progress *prog)
  {
         struct rb_root *root;
         struct rb_node *next;
         struct hist_entry *n;
+       int ret;
  
         if (!sort__need_collapse)
-               return;
+               return 0;
  
         hists->nr_entries = 0;
  
@@ -1095,7 +1388,11 @@ void hists__collapse_resort(struct hists *hists, struct ui_progress *prog)
                 next = rb_next(&n->rb_node_in);
  
                 rb_erase(&n->rb_node_in, root);
-               if (hists__collapse_insert_entry(hists, &hists->entries_collapsed, n)) {
+               ret = hists__collapse_insert_entry(hists, &hists->entries_collapsed, n);
+               if (ret < 0)
+                       return -1;
+
+               if (ret) {
                         /*
                          * If it wasn't combined with one of the entries already
                          * collapsed, we need to apply the filters that may have
@@ -1106,14 +1403,16 @@ void hists__collapse_resort(struct hists *hists, struct ui_progress *prog)
                 if (prog)
                         ui_progress__update(prog, 1);
         }
+       return 0;
  }
  
  static int hist_entry__sort(struct hist_entry *a, struct hist_entry *b)
  {
+       struct hists *hists = a->hists;
         struct perf_hpp_fmt *fmt;
         int64_t cmp = 0;
  
-       perf_hpp__for_each_sort_list(fmt) {
+       hists__for_each_sort_list(hists, fmt) {
                 if (perf_hpp__should_skip(fmt, a->hists))
                         continue;
  
@@ -1154,6 +1453,113 @@ void hists__inc_stats(struct hists *hists, struct hist_entry *h)
         hists->stats.total_period += h->stat.period;
  }
  
+static void hierarchy_recalc_total_periods(struct hists *hists)
+{
+       struct rb_node *node;
+       struct hist_entry *he;
+
+       node = rb_first(&hists->entries);
+
+       hists->stats.total_period = 0;
+       hists->stats.total_non_filtered_period = 0;
+
+       /*
+        * recalculate total period using top-level entries only
+        * since lower level entries only see non-filtered entries
+        * but upper level entries have sum of both entries.
+        */
+       while (node) {
+               he = rb_entry(node, struct hist_entry, rb_node);
+               node = rb_next(node);
+
+               hists->stats.total_period += he->stat.period;
+               if (!he->filtered)
+                       hists->stats.total_non_filtered_period += he->stat.period;
+       }
+}
+
+static void hierarchy_insert_output_entry(struct rb_root *root,
+                                         struct hist_entry *he)
+{
+       struct rb_node **p = &root->rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *iter;
+       struct perf_hpp_fmt *fmt;
+
+       while (*p != NULL) {
+               parent = *p;
+               iter = rb_entry(parent, struct hist_entry, rb_node);
+
+               if (hist_entry__sort(he, iter) > 0)
+                       p = &parent->rb_left;
+               else
+                       p = &parent->rb_right;
+       }
+
+       rb_link_node(&he->rb_node, parent, p);
+       rb_insert_color(&he->rb_node, root);
+
+       /* update column width of dynamic entry */
+       perf_hpp_list__for_each_sort_list(he->hpp_list, fmt) {
+               if (perf_hpp__is_dynamic_entry(fmt))
+                       fmt->sort(fmt, he, NULL);
+       }
+}
+
+static void hists__hierarchy_output_resort(struct hists *hists,
+                                          struct ui_progress *prog,
+                                          struct rb_root *root_in,
+                                          struct rb_root *root_out,
+                                          u64 min_callchain_hits,
+                                          bool use_callchain)
+{
+       struct rb_node *node;
+       struct hist_entry *he;
+
+       *root_out = RB_ROOT;
+       node = rb_first(root_in);
+
+       while (node) {
+               he = rb_entry(node, struct hist_entry, rb_node_in);
+               node = rb_next(node);
+
+               hierarchy_insert_output_entry(root_out, he);
+
+               if (prog)
+                       ui_progress__update(prog, 1);
+
+               if (!he->leaf) {
+                       hists__hierarchy_output_resort(hists, prog,
+                                                      &he->hroot_in,
+                                                      &he->hroot_out,
+                                                      min_callchain_hits,
+                                                      use_callchain);
+                       hists->nr_entries++;
+                       if (!he->filtered) {
+                               hists->nr_non_filtered_entries++;
+                               hists__calc_col_len(hists, he);
+                       }
+
+                       continue;
+               }
+
+               if (!use_callchain)
+                       continue;
+
+               if (callchain_param.mode == CHAIN_GRAPH_REL) {
+                       u64 total = he->stat.period;
+
+                       if (symbol_conf.cumulate_callchain)
+                               total = he->stat_acc->period;
+
+                       min_callchain_hits = total * (callchain_param.min_percent / 100);
+               }
+
+               callchain_param.sort(&he->sorted_chain, he->callchain,
+                                    min_callchain_hits, &callchain_param);
+       }
+}
+
  static void __hists__insert_output_entry(struct rb_root *entries,
                                          struct hist_entry *he,
                                          u64 min_callchain_hits,
@@ -1162,10 +1568,20 @@ static void __hists__insert_output_entry(struct rb_root *entries,
         struct rb_node **p = &entries->rb_node;
         struct rb_node *parent = NULL;
         struct hist_entry *iter;
+       struct perf_hpp_fmt *fmt;
+
+       if (use_callchain) {
+               if (callchain_param.mode == CHAIN_GRAPH_REL) {
+                       u64 total = he->stat.period;
+
+                       if (symbol_conf.cumulate_callchain)
+                               total = he->stat_acc->period;
  
-       if (use_callchain)
+                       min_callchain_hits = total * (callchain_param.min_percent / 100);
+               }
                 callchain_param.sort(&he->sorted_chain, he->callchain,
                                       min_callchain_hits, &callchain_param);
+       }
  
         while (*p != NULL) {
                 parent = *p;
@@ -1179,23 +1595,41 @@ static void __hists__insert_output_entry(struct rb_root *entries,
  
         rb_link_node(&he->rb_node, parent, p);
         rb_insert_color(&he->rb_node, entries);
+
+       perf_hpp_list__for_each_sort_list(&perf_hpp_list, fmt) {
+               if (perf_hpp__is_dynamic_entry(fmt) &&
+                   perf_hpp__defined_dynamic_entry(fmt, he->hists))
+                       fmt->sort(fmt, he, NULL);  /* update column width */
+       }
  }
  
-void hists__output_resort(struct hists *hists, struct ui_progress *prog)
+static void output_resort(struct hists *hists, struct ui_progress *prog,
+                         bool use_callchain)
  {
         struct rb_root *root;
         struct rb_node *next;
         struct hist_entry *n;
+       u64 callchain_total;
         u64 min_callchain_hits;
-       struct perf_evsel *evsel = hists_to_evsel(hists);
-       bool use_callchain;
  
-       if (evsel && symbol_conf.use_callchain && !symbol_conf.show_ref_callgraph)
-               use_callchain = evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN;
-       else
-               use_callchain = symbol_conf.use_callchain;
+       callchain_total = hists->callchain_period;
+       if (symbol_conf.filter_relative)
+               callchain_total = hists->callchain_non_filtered_period;
  
-       min_callchain_hits = hists->stats.total_period * (callchain_param.min_percent / 100);
+       min_callchain_hits = callchain_total * (callchain_param.min_percent / 100);
+
+       hists__reset_stats(hists);
+       hists__reset_col_len(hists);
+
+       if (symbol_conf.report_hierarchy) {
+               hists__hierarchy_output_resort(hists, prog,
+                                              &hists->entries_collapsed,
+                                              &hists->entries,
+                                              min_callchain_hits,
+                                              use_callchain);
+               hierarchy_recalc_total_periods(hists);
+               return;
+       }
  
         if (sort__need_collapse)
                 root = &hists->entries_collapsed;
@@ -1205,9 +1639,6 @@ void hists__output_resort(struct hists *hists, struct ui_progress *prog)
         next = rb_first(root);
         hists->entries = RB_ROOT;
  
-       hists__reset_stats(hists);
-       hists__reset_col_len(hists);
-
         while (next) {
                 n = rb_entry(next, struct hist_entry, rb_node_in);
                 next = rb_next(&n->rb_node_in);
@@ -1223,15 +1654,136 @@ void hists__output_resort(struct hists *hists, struct ui_progress *prog)
         }
  }
  
+void perf_evsel__output_resort(struct perf_evsel *evsel, struct ui_progress *prog)
+{
+       bool use_callchain;
+
+       if (evsel && symbol_conf.use_callchain && !symbol_conf.show_ref_callgraph)
+               use_callchain = evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN;
+       else
+               use_callchain = symbol_conf.use_callchain;
+
+       output_resort(evsel__hists(evsel), prog, use_callchain);
+}
+
+void hists__output_resort(struct hists *hists, struct ui_progress *prog)
+{
+       output_resort(hists, prog, symbol_conf.use_callchain);
+}
+
+static bool can_goto_child(struct hist_entry *he, enum hierarchy_move_dir hmd)
+{
+       if (he->leaf || hmd == HMD_FORCE_SIBLING)
+               return false;
+
+       if (he->unfolded || hmd == HMD_FORCE_CHILD)
+               return true;
+
+       return false;
+}
+
+struct rb_node *rb_hierarchy_last(struct rb_node *node)
+{
+       struct hist_entry *he = rb_entry(node, struct hist_entry, rb_node);
+
+       while (can_goto_child(he, HMD_NORMAL)) {
+               node = rb_last(&he->hroot_out);
+               he = rb_entry(node, struct hist_entry, rb_node);
+       }
+       return node;
+}
+
+struct rb_node *__rb_hierarchy_next(struct rb_node *node, enum hierarchy_move_dir hmd)
+{
+       struct hist_entry *he = rb_entry(node, struct hist_entry, rb_node);
+
+       if (can_goto_child(he, hmd))
+               node = rb_first(&he->hroot_out);
+       else
+               node = rb_next(node);
+
+       while (node == NULL) {
+               he = he->parent_he;
+               if (he == NULL)
+                       break;
+
+               node = rb_next(&he->rb_node);
+       }
+       return node;
+}
+
+struct rb_node *rb_hierarchy_prev(struct rb_node *node)
+{
+       struct hist_entry *he = rb_entry(node, struct hist_entry, rb_node);
+
+       node = rb_prev(node);
+       if (node)
+               return rb_hierarchy_last(node);
+
+       he = he->parent_he;
+       if (he == NULL)
+               return NULL;
+
+       return &he->rb_node;
+}
+
+bool hist_entry__has_hierarchy_children(struct hist_entry *he, float limit)
+{
+       struct rb_node *node;
+       struct hist_entry *child;
+       float percent;
+
+       if (he->leaf)
+               return false;
+
+       node = rb_first(&he->hroot_out);
+       child = rb_entry(node, struct hist_entry, rb_node);
+
+       while (node && child->filtered) {
+               node = rb_next(node);
+               child = rb_entry(node, struct hist_entry, rb_node);
+       }
+
+       if (node)
+               percent = hist_entry__get_percent_limit(child);
+       else
+               percent = 0;
+
+       return node && percent >= limit;
+}
+
  static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *h,
                                        enum hist_filter filter)
  {
         h->filtered &= ~(1 << filter);
+
+       if (symbol_conf.report_hierarchy) {
+               struct hist_entry *parent = h->parent_he;
+
+               while (parent) {
+                       he_stat__add_stat(&parent->stat, &h->stat);
+
+                       parent->filtered &= ~(1 << filter);
+
+                       if (parent->filtered)
+                               goto next;
+
+                       /* force fold unfiltered entry for simplicity */
+                       parent->unfolded = false;
+                       parent->has_no_entry = false;
+                       parent->row_offset = 0;
+                       parent->nr_rows = 0;
+next:
+                       parent = parent->parent_he;
+               }
+       }
+
         if (h->filtered)
                 return;
  
         /* force fold unfiltered entry for simplicity */
         h->unfolded = false;
+       h->has_no_entry = false;
         h->row_offset = 0;
         h->nr_rows = 0;
  
@@ -1254,28 +1806,6 @@ static bool hists__filter_entry_by_dso(struct hists *hists,
         return false;
  }
  
-void hists__filter_by_dso(struct hists *hists)
-{
-       struct rb_node *nd;
-
-       hists->stats.nr_non_filtered_samples = 0;
-
-       hists__reset_filter_stats(hists);
-       hists__reset_col_len(hists);
-
-       for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
-               struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
-
-               if (symbol_conf.exclude_other && !h->parent)
-                       continue;
-
-               if (hists__filter_entry_by_dso(hists, h))
-                       continue;
-
-               hists__remove_entry_filter(hists, h, HIST_FILTER__DSO);
-       }
-}
-
  static bool hists__filter_entry_by_thread(struct hists *hists,
                                           struct hist_entry *he)
  {
@@ -1288,25 +1818,6 @@ static bool hists__filter_entry_by_thread(struct hists *hists,
         return false;
  }
  
-void hists__filter_by_thread(struct hists *hists)
-{
-       struct rb_node *nd;
-
-       hists->stats.nr_non_filtered_samples = 0;
-
-       hists__reset_filter_stats(hists);
-       hists__reset_col_len(hists);
-
-       for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
-               struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
-
-               if (hists__filter_entry_by_thread(hists, h))
-                       continue;
-
-               hists__remove_entry_filter(hists, h, HIST_FILTER__THREAD);
-       }
-}
-
  static bool hists__filter_entry_by_symbol(struct hists *hists,
                                           struct hist_entry *he)
  {
@@ -1320,7 +1831,21 @@ static bool hists__filter_entry_by_symbol(struct hists *hists,
         return false;
  }
  
-void hists__filter_by_symbol(struct hists *hists)
+static bool hists__filter_entry_by_socket(struct hists *hists,
+                                         struct hist_entry *he)
+{
+       if ((hists->socket_filter > -1) &&
+           (he->socket != hists->socket_filter)) {
+               he->filtered |= (1 << HIST_FILTER__SOCKET);
+               return true;
+       }
+
+       return false;
+}
+
+typedef bool (*filter_fn_t)(struct hists *hists, struct hist_entry *he);
+
+static void hists__filter_by_type(struct hists *hists, int type, filter_fn_t filter)
  {
         struct rb_node *nd;
  
@@ -1332,42 +1857,155 @@ void hists__filter_by_symbol(struct hists *hists)
         for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
                 struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
  
-               if (hists__filter_entry_by_symbol(hists, h))
+               if (filter(hists, h))
                         continue;
  
-               hists__remove_entry_filter(hists, h, HIST_FILTER__SYMBOL);
+               hists__remove_entry_filter(hists, h, type);
         }
  }
  
-static bool hists__filter_entry_by_socket(struct hists *hists,
-                                         struct hist_entry *he)
+static void resort_filtered_entry(struct rb_root *root, struct hist_entry *he)
  {
-       if ((hists->socket_filter > -1) &&
-           (he->socket != hists->socket_filter)) {
-               he->filtered |= (1 << HIST_FILTER__SOCKET);
-               return true;
+       struct rb_node **p = &root->rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *iter;
+       struct rb_root new_root = RB_ROOT;
+       struct rb_node *nd;
+
+       while (*p != NULL) {
+               parent = *p;
+               iter = rb_entry(parent, struct hist_entry, rb_node);
+
+               if (hist_entry__sort(he, iter) > 0)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
         }
  
-       return false;
+       rb_link_node(&he->rb_node, parent, p);
+       rb_insert_color(&he->rb_node, root);
+
+       if (he->leaf || he->filtered)
+               return;
+
+       nd = rb_first(&he->hroot_out);
+       while (nd) {
+               struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
+
+               nd = rb_next(nd);
+               rb_erase(&h->rb_node, &he->hroot_out);
+
+               resort_filtered_entry(&new_root, h);
+       }
+
+       he->hroot_out = new_root;
  }
  
-void hists__filter_by_socket(struct hists *hists)
+static void hists__filter_hierarchy(struct hists *hists, int type, const void *arg)
  {
         struct rb_node *nd;
+       struct rb_root new_root = RB_ROOT;
  
         hists->stats.nr_non_filtered_samples = 0;
  
         hists__reset_filter_stats(hists);
         hists__reset_col_len(hists);
  
-       for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
+       nd = rb_first(&hists->entries);
+       while (nd) {
                 struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
+               int ret;
  
-               if (hists__filter_entry_by_socket(hists, h))
-                       continue;
+               ret = hist_entry__filter(h, type, arg);
  
-               hists__remove_entry_filter(hists, h, HIST_FILTER__SOCKET);
+               /*
+                * case 1. non-matching type
+                * zero out the period, set filter marker and move to child
+                */
+               if (ret < 0) {
+                       memset(&h->stat, 0, sizeof(h->stat));
+                       h->filtered |= (1 << type);
+
+                       nd = __rb_hierarchy_next(&h->rb_node, HMD_FORCE_CHILD);
+               }
+               /*
+                * case 2. matched type (filter out)
+                * set filter marker and move to next
+                */
+               else if (ret == 1) {
+                       h->filtered |= (1 << type);
+
+                       nd = __rb_hierarchy_next(&h->rb_node, HMD_FORCE_SIBLING);
+               }
+               /*
+                * case 3. ok (not filtered)
+                * add period to hists and parents, erase the filter marker
+                * and move to next sibling
+                */
+               else {
+                       hists__remove_entry_filter(hists, h, type);
+
+                       nd = __rb_hierarchy_next(&h->rb_node, HMD_FORCE_SIBLING);
+               }
+       }
+
+       hierarchy_recalc_total_periods(hists);
+
+       /*
+        * resort output after applying a new filter since filter in a lower
+        * hierarchy can change periods in a upper hierarchy.
+        */
+       nd = rb_first(&hists->entries);
+       while (nd) {
+               struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
+
+               nd = rb_next(nd);
+               rb_erase(&h->rb_node, &hists->entries);
+
+               resort_filtered_entry(&new_root, h);
         }
+
+       hists->entries = new_root;
+}
+
+void hists__filter_by_thread(struct hists *hists)
+{
+       if (symbol_conf.report_hierarchy)
+               hists__filter_hierarchy(hists, HIST_FILTER__THREAD,
+                                       hists->thread_filter);
+       else
+               hists__filter_by_type(hists, HIST_FILTER__THREAD,
+                                     hists__filter_entry_by_thread);
+}
+
+void hists__filter_by_dso(struct hists *hists)
+{
+       if (symbol_conf.report_hierarchy)
+               hists__filter_hierarchy(hists, HIST_FILTER__DSO,
+                                       hists->dso_filter);
+       else
+               hists__filter_by_type(hists, HIST_FILTER__DSO,
+                                     hists__filter_entry_by_dso);
+}
+
+void hists__filter_by_symbol(struct hists *hists)
+{
+       if (symbol_conf.report_hierarchy)
+               hists__filter_hierarchy(hists, HIST_FILTER__SYMBOL,
+                                       hists->symbol_filter_str);
+       else
+               hists__filter_by_type(hists, HIST_FILTER__SYMBOL,
+                                     hists__filter_entry_by_symbol);
+}
+
+void hists__filter_by_socket(struct hists *hists)
+{
+       if (symbol_conf.report_hierarchy)
+               hists__filter_hierarchy(hists, HIST_FILTER__SOCKET,
+                                       &hists->socket_filter);
+       else
+               hists__filter_by_type(hists, HIST_FILTER__SOCKET,
+                                     hists__filter_entry_by_socket);
  }
  
  void events_stats__inc(struct events_stats *stats, u32 type)
@@ -1585,7 +2223,7 @@ int perf_hist_config(const char *var, const char *value)
         return 0;
  }
  
-int __hists__init(struct hists *hists)
+int __hists__init(struct hists *hists, struct perf_hpp_list *hpp_list)
  {
         memset(hists, 0, sizeof(*hists));
         hists->entries_in_array[0] = hists->entries_in_array[1] = RB_ROOT;
@@ -1594,6 +2232,8 @@ int __hists__init(struct hists *hists)
         hists->entries = RB_ROOT;
         pthread_mutex_init(&hists->lock, NULL);
         hists->socket_filter = -1;
+       hists->hpp_list = hpp_list;
+       INIT_LIST_HEAD(&hists->hpp_formats);
         return 0;
  }
  
@@ -1622,15 +2262,26 @@ static void hists__delete_all_entries(struct hists *hists)
  static void hists_evsel__exit(struct perf_evsel *evsel)
  {
         struct hists *hists = evsel__hists(evsel);
+       struct perf_hpp_fmt *fmt, *pos;
+       struct perf_hpp_list_node *node, *tmp;
  
         hists__delete_all_entries(hists);
+
+       list_for_each_entry_safe(node, tmp, &hists->hpp_formats, list) {
+               perf_hpp_list__for_each_format_safe(&node->hpp, fmt, pos) {
+                       list_del(&fmt->list);
+                       free(fmt);
+               }
+               list_del(&node->list);
+               free(node);
+       }
  }
  
  static int hists_evsel__init(struct perf_evsel *evsel)
  {
         struct hists *hists = evsel__hists(evsel);
  
-       __hists__init(hists);
+       __hists__init(hists, &perf_hpp_list);
         return 0;
  }
  
@@ -1649,3 +2300,9 @@ int hists__init(void)
  
         return err;
  }
+
+void perf_hpp_list__init(struct perf_hpp_list *list)
+{
+       INIT_LIST_HEAD(&list->fields);
+       INIT_LIST_HEAD(&list->sorts);
+}
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h

index d4ec4822a1038611a7269aa0df2eb37171a647a6..ead18c82294faf881f1d4bc89be653e27ff5c708 100644 (file)
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -66,6 +66,8 @@ struct hists {
         struct rb_root          entries_collapsed;
         u64                     nr_entries;
         u64                     nr_non_filtered_entries;
+       u64                     callchain_period;
+       u64                     callchain_non_filtered_period;
         struct thread           *thread_filter;
         const struct dso        *dso_filter;
         const char              *uid_filter_str;
@@ -75,6 +77,9 @@ struct hists {
         u64                     event_stream;
         u16                     col_len[HISTC_NR_COLS];
         int                     socket_filter;
+       struct perf_hpp_list    *hpp_list;
+       struct list_head        hpp_formats;
+       int                     nr_hpp_node;
  };
  
  struct hist_entry_iter;
@@ -121,15 +126,21 @@ struct hist_entry *__hists__add_entry(struct hists *hists,
  int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al,
                          int max_stack_depth, void *arg);
  
+struct perf_hpp;
+struct perf_hpp_fmt;
+
  int64_t hist_entry__cmp(struct hist_entry *left, struct hist_entry *right);
  int64_t hist_entry__collapse(struct hist_entry *left, struct hist_entry *right);
  int hist_entry__transaction_len(void);
  int hist_entry__sort_snprintf(struct hist_entry *he, char *bf, size_t size,
                               struct hists *hists);
+int hist_entry__snprintf_alignment(struct hist_entry *he, struct perf_hpp *hpp,
+                                  struct perf_hpp_fmt *fmt, int printed);
  void hist_entry__delete(struct hist_entry *he);
  
+void perf_evsel__output_resort(struct perf_evsel *evsel, struct ui_progress *prog);
  void hists__output_resort(struct hists *hists, struct ui_progress *prog);
-void hists__collapse_resort(struct hists *hists, struct ui_progress *prog);
+int hists__collapse_resort(struct hists *hists, struct ui_progress *prog);
  
  void hists__decay_entries(struct hists *hists, bool zap_user, bool zap_kernel);
  void hists__delete_entries(struct hists *hists);
@@ -185,10 +196,10 @@ static inline struct hists *evsel__hists(struct perf_evsel *evsel)
  }
  
  int hists__init(void);
-int __hists__init(struct hists *hists);
+int __hists__init(struct hists *hists, struct perf_hpp_list *hpp_list);
  
  struct rb_root *hists__get_rotate_entries_in(struct hists *hists);
-bool hists__collapse_insert_entry(struct hists *hists __maybe_unused,
+int hists__collapse_insert_entry(struct hists *hists,
                                   struct rb_root *root, struct hist_entry *he);
  
  struct perf_hpp {
@@ -214,28 +225,64 @@ struct perf_hpp_fmt {
                             struct hist_entry *a, struct hist_entry *b);
         int64_t (*sort)(struct perf_hpp_fmt *fmt,
                         struct hist_entry *a, struct hist_entry *b);
+       bool (*equal)(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b);
+       void (*free)(struct perf_hpp_fmt *fmt);
  
         struct list_head list;
         struct list_head sort_list;
         bool elide;
         int len;
         int user_len;
+       int idx;
+       int level;
+};
+
+struct perf_hpp_list {
+       struct list_head fields;
+       struct list_head sorts;
  };
  
-extern struct list_head perf_hpp__list;
-extern struct list_head perf_hpp__sort_list;
+extern struct perf_hpp_list perf_hpp_list;
+
+struct perf_hpp_list_node {
+       struct list_head        list;
+       struct perf_hpp_list    hpp;
+       int                     level;
+       bool                    skip;
+};
+
+void perf_hpp_list__column_register(struct perf_hpp_list *list,
+                                   struct perf_hpp_fmt *format);
+void perf_hpp_list__register_sort_field(struct perf_hpp_list *list,
+                                       struct perf_hpp_fmt *format);
+
+static inline void perf_hpp__column_register(struct perf_hpp_fmt *format)
+{
+       perf_hpp_list__column_register(&perf_hpp_list, format);
+}
+
+static inline void perf_hpp__register_sort_field(struct perf_hpp_fmt *format)
+{
+       perf_hpp_list__register_sort_field(&perf_hpp_list, format);
+}
+
+#define perf_hpp_list__for_each_format(_list, format) \
+       list_for_each_entry(format, &(_list)->fields, list)
  
-#define perf_hpp__for_each_format(format) \
-       list_for_each_entry(format, &perf_hpp__list, list)
+#define perf_hpp_list__for_each_format_safe(_list, format, tmp)        \
+       list_for_each_entry_safe(format, tmp, &(_list)->fields, list)
  
-#define perf_hpp__for_each_format_safe(format, tmp)    \
-       list_for_each_entry_safe(format, tmp, &perf_hpp__list, list)
+#define perf_hpp_list__for_each_sort_list(_list, format) \
+       list_for_each_entry(format, &(_list)->sorts, sort_list)
  
-#define perf_hpp__for_each_sort_list(format) \
-       list_for_each_entry(format, &perf_hpp__sort_list, sort_list)
+#define perf_hpp_list__for_each_sort_list_safe(_list, format, tmp)     \
+       list_for_each_entry_safe(format, tmp, &(_list)->sorts, sort_list)
  
-#define perf_hpp__for_each_sort_list_safe(format, tmp) \
-       list_for_each_entry_safe(format, tmp, &perf_hpp__sort_list, sort_list)
+#define hists__for_each_format(hists, format) \
+       perf_hpp_list__for_each_format((hists)->hpp_list, fmt)
+
+#define hists__for_each_sort_list(hists, format) \
+       perf_hpp_list__for_each_sort_list((hists)->hpp_list, fmt)
  
  extern struct perf_hpp_fmt perf_hpp__format[];
  
@@ -254,21 +301,29 @@ enum {
  };
  
  void perf_hpp__init(void);
-void perf_hpp__column_register(struct perf_hpp_fmt *format);
  void perf_hpp__column_unregister(struct perf_hpp_fmt *format);
-void perf_hpp__column_enable(unsigned col);
-void perf_hpp__column_disable(unsigned col);
  void perf_hpp__cancel_cumulate(void);
+void perf_hpp__setup_output_field(struct perf_hpp_list *list);
+void perf_hpp__reset_output_field(struct perf_hpp_list *list);
+void perf_hpp__append_sort_keys(struct perf_hpp_list *list);
+int perf_hpp__setup_hists_formats(struct perf_hpp_list *list,
+                                 struct perf_evlist *evlist);
  
-void perf_hpp__register_sort_field(struct perf_hpp_fmt *format);
-void perf_hpp__setup_output_field(void);
-void perf_hpp__reset_output_field(void);
-void perf_hpp__append_sort_keys(void);
  
  bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format);
-bool perf_hpp__same_sort_entry(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b);
  bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt *format);
  bool perf_hpp__defined_dynamic_entry(struct perf_hpp_fmt *fmt, struct hists *hists);
+bool perf_hpp__is_trace_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_srcline_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_srcfile_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_thread_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_comm_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_dso_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_sym_entry(struct perf_hpp_fmt *fmt);
+
+struct perf_hpp_fmt *perf_hpp_fmt__dup(struct perf_hpp_fmt *fmt);
+
+int hist_entry__filter(struct hist_entry *he, int type, const void *arg);
  
  static inline bool perf_hpp__should_skip(struct perf_hpp_fmt *format,
                                          struct hists *hists)
@@ -372,6 +427,7 @@ static inline int script_browse(const char *script_opt __maybe_unused)
  #endif
  
  unsigned int hists__sort_list_width(struct hists *hists);
+unsigned int hists__overhead_width(struct hists *hists);
  
  void hist__account_cycles(struct branch_stack *bs, struct addr_location *al,
                           struct perf_sample *sample, bool nonany_branch_mode);
@@ -381,4 +437,26 @@ int parse_filter_percentage(const struct option *opt __maybe_unused,
                             const char *arg, int unset __maybe_unused);
  int perf_hist_config(const char *var, const char *value);
  
+void perf_hpp_list__init(struct perf_hpp_list *list);
+
+enum hierarchy_move_dir {
+       HMD_NORMAL,
+       HMD_FORCE_SIBLING,
+       HMD_FORCE_CHILD,
+};
+
+struct rb_node *rb_hierarchy_last(struct rb_node *node);
+struct rb_node *__rb_hierarchy_next(struct rb_node *node,
+                                   enum hierarchy_move_dir hmd);
+struct rb_node *rb_hierarchy_prev(struct rb_node *node);
+
+static inline struct rb_node *rb_hierarchy_next(struct rb_node *node)
+{
+       return __rb_hierarchy_next(node, HMD_NORMAL);
+}
+
+#define HIERARCHY_INDENT  3
+
+bool hist_entry__has_hierarchy_children(struct hist_entry *he, float limit);
+
  #endif /* __PERF_HIST_H */
diff --git a/tools/perf/util/jit.h b/tools/perf/util/jit.h

new file mode 100644 (file)

index 0000000..a1e99da
--- /dev/null
+++ b/tools/perf/util/jit.h
@@ -0,0 +1,15 @@
+#ifndef __JIT_H__
+#define __JIT_H__
+
+#include <data.h>
+
+extern int jit_process(struct perf_session *session,
+                      struct perf_data_file *output,
+                      struct machine *machine,
+                      char *filename,
+                      pid_t pid,
+                      u64 *nbytes);
+
+extern int jit_inject_record(const char *filename);
+
+#endif /* __JIT_H__ */
diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c

new file mode 100644 (file)

index 0000000..cd272cc
--- /dev/null
+++ b/tools/perf/util/jitdump.c
@@ -0,0 +1,697 @@
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <byteswap.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include "util.h"
+#include "event.h"
+#include "debug.h"
+#include "evlist.h"
+#include "symbol.h"
+#include "strlist.h"
+#include <elf.h>
+
+#include "session.h"
+#include "jit.h"
+#include "jitdump.h"
+#include "genelf.h"
+#include "../builtin.h"
+
+struct jit_buf_desc {
+       struct perf_data_file *output;
+       struct perf_session *session;
+       struct machine *machine;
+       union jr_entry   *entry;
+       void             *buf;
+       uint64_t         sample_type;
+       size_t           bufsize;
+       FILE             *in;
+       bool             needs_bswap; /* handles cross-endianess */
+       void             *debug_data;
+       size_t           nr_debug_entries;
+       uint32_t         code_load_count;
+       u64              bytes_written;
+       struct rb_root   code_root;
+       char             dir[PATH_MAX];
+};
+
+struct debug_line_info {
+       unsigned long vma;
+       unsigned int lineno;
+       /* The filename format is unspecified, absolute path, relative etc. */
+       char const filename[0];
+};
+
+struct jit_tool {
+       struct perf_tool tool;
+       struct perf_data_file   output;
+       struct perf_data_file   input;
+       u64 bytes_written;
+};
+
+#define hmax(a, b) ((a) > (b) ? (a) : (b))
+#define get_jit_tool(t) (container_of(tool, struct jit_tool, tool))
+
+static int
+jit_emit_elf(char *filename,
+            const char *sym,
+            uint64_t code_addr,
+            const void *code,
+            int csize,
+            void *debug,
+            int nr_debug_entries)
+{
+       int ret, fd;
+
+       if (verbose > 0)
+               fprintf(stderr, "write ELF image %s\n", filename);
+
+       fd = open(filename, O_CREAT|O_TRUNC|O_WRONLY, 0644);
+       if (fd == -1) {
+               pr_warning("cannot create jit ELF %s: %s\n", filename, strerror(errno));
+               return -1;
+       }
+
+        ret = jit_write_elf(fd, code_addr, sym, (const void *)code, csize, debug, nr_debug_entries);
+
+        close(fd);
+
+        if (ret)
+                unlink(filename);
+
+       return ret;
+}
+
+static void
+jit_close(struct jit_buf_desc *jd)
+{
+       if (!(jd && jd->in))
+               return;
+       funlockfile(jd->in);
+       fclose(jd->in);
+       jd->in = NULL;
+}
+
+static int
+jit_validate_events(struct perf_session *session)
+{
+       struct perf_evsel *evsel;
+
+       /*
+        * check that all events use CLOCK_MONOTONIC
+        */
+       evlist__for_each(session->evlist, evsel) {
+               if (evsel->attr.use_clockid == 0 || evsel->attr.clockid != CLOCK_MONOTONIC)
+                       return -1;
+       }
+       return 0;
+}
+
+static int
+jit_open(struct jit_buf_desc *jd, const char *name)
+{
+       struct jitheader header;
+       struct jr_prefix *prefix;
+       ssize_t bs, bsz = 0;
+       void *n, *buf = NULL;
+       int ret, retval = -1;
+
+       jd->in = fopen(name, "r");
+       if (!jd->in)
+               return -1;
+
+       bsz = hmax(sizeof(header), sizeof(*prefix));
+
+       buf = malloc(bsz);
+       if (!buf)
+               goto error;
+
+       /*
+        * protect from writer modifying the file while we are reading it
+        */
+       flockfile(jd->in);
+
+       ret = fread(buf, sizeof(header), 1, jd->in);
+       if (ret != 1)
+               goto error;
+
+       memcpy(&header, buf, sizeof(header));
+
+       if (header.magic != JITHEADER_MAGIC) {
+               if (header.magic != JITHEADER_MAGIC_SW)
+                       goto error;
+               jd->needs_bswap = true;
+       }
+
+       if (jd->needs_bswap) {
+               header.version    = bswap_32(header.version);
+               header.total_size = bswap_32(header.total_size);
+               header.pid        = bswap_32(header.pid);
+               header.elf_mach   = bswap_32(header.elf_mach);
+               header.timestamp  = bswap_64(header.timestamp);
+               header.flags      = bswap_64(header.flags);
+       }
+
+       if (verbose > 2)
+               pr_debug("version=%u\nhdr.size=%u\nts=0x%llx\npid=%d\nelf_mach=%d\n",
+                       header.version,
+                       header.total_size,
+                       (unsigned long long)header.timestamp,
+                       header.pid,
+                       header.elf_mach);
+
+       if (header.flags & JITDUMP_FLAGS_RESERVED) {
+               pr_err("jitdump file contains invalid or unsupported flags 0x%llx\n",
+                      (unsigned long long)header.flags & JITDUMP_FLAGS_RESERVED);
+               goto error;
+       }
+
+       /*
+        * validate event is using the correct clockid
+        */
+       if (jit_validate_events(jd->session)) {
+               pr_err("error, jitted code must be sampled with perf record -k 1\n");
+               goto error;
+       }
+
+       bs = header.total_size - sizeof(header);
+
+       if (bs > bsz) {
+               n = realloc(buf, bs);
+               if (!n)
+                       goto error;
+               bsz = bs;
+               buf = n;
+               /* read extra we do not know about */
+               ret = fread(buf, bs - bsz, 1, jd->in);
+               if (ret != 1)
+                       goto error;
+       }
+       /*
+        * keep dirname for generating files and mmap records
+        */
+       strcpy(jd->dir, name);
+       dirname(jd->dir);
+
+       return 0;
+error:
+       funlockfile(jd->in);
+       fclose(jd->in);
+       return retval;
+}
+
+static union jr_entry *
+jit_get_next_entry(struct jit_buf_desc *jd)
+{
+       struct jr_prefix *prefix;
+       union jr_entry *jr;
+       void *addr;
+       size_t bs, size;
+       int id, ret;
+
+       if (!(jd && jd->in))
+               return NULL;
+
+       if (jd->buf == NULL) {
+               size_t sz = getpagesize();
+               if (sz < sizeof(*prefix))
+                       sz = sizeof(*prefix);
+
+               jd->buf = malloc(sz);
+               if (jd->buf == NULL)
+                       return NULL;
+
+               jd->bufsize = sz;
+       }
+
+       prefix = jd->buf;
+
+       /*
+        * file is still locked at this point
+        */
+       ret = fread(prefix, sizeof(*prefix), 1, jd->in);
+       if (ret  != 1)
+               return NULL;
+
+       if (jd->needs_bswap) {
+               prefix->id         = bswap_32(prefix->id);
+               prefix->total_size = bswap_32(prefix->total_size);
+               prefix->timestamp  = bswap_64(prefix->timestamp);
+       }
+       id   = prefix->id;
+       size = prefix->total_size;
+
+       bs = (size_t)size;
+       if (bs < sizeof(*prefix))
+               return NULL;
+
+       if (id >= JIT_CODE_MAX) {
+               pr_warning("next_entry: unknown prefix %d, skipping\n", id);
+               return NULL;
+       }
+       if (bs > jd->bufsize) {
+               void *n;
+               n = realloc(jd->buf, bs);
+               if (!n)
+                       return NULL;
+               jd->buf = n;
+               jd->bufsize = bs;
+       }
+
+       addr = ((void *)jd->buf) + sizeof(*prefix);
+
+       ret = fread(addr, bs - sizeof(*prefix), 1, jd->in);
+       if (ret != 1)
+               return NULL;
+
+       jr = (union jr_entry *)jd->buf;
+
+       switch(id) {
+       case JIT_CODE_DEBUG_INFO:
+               if (jd->needs_bswap) {
+                       uint64_t n;
+                       jr->info.code_addr = bswap_64(jr->info.code_addr);
+                       jr->info.nr_entry  = bswap_64(jr->info.nr_entry);
+                       for (n = 0 ; n < jr->info.nr_entry; n++) {
+                               jr->info.entries[n].addr    = bswap_64(jr->info.entries[n].addr);
+                               jr->info.entries[n].lineno  = bswap_32(jr->info.entries[n].lineno);
+                               jr->info.entries[n].discrim = bswap_32(jr->info.entries[n].discrim);
+                       }
+               }
+               break;
+       case JIT_CODE_CLOSE:
+               break;
+       case JIT_CODE_LOAD:
+               if (jd->needs_bswap) {
+                       jr->load.pid       = bswap_32(jr->load.pid);
+                       jr->load.tid       = bswap_32(jr->load.tid);
+                       jr->load.vma       = bswap_64(jr->load.vma);
+                       jr->load.code_addr = bswap_64(jr->load.code_addr);
+                       jr->load.code_size = bswap_64(jr->load.code_size);
+                       jr->load.code_index= bswap_64(jr->load.code_index);
+               }
+               jd->code_load_count++;
+               break;
+       case JIT_CODE_MOVE:
+               if (jd->needs_bswap) {
+                       jr->move.pid           = bswap_32(jr->move.pid);
+                       jr->move.tid           = bswap_32(jr->move.tid);
+                       jr->move.vma           = bswap_64(jr->move.vma);
+                       jr->move.old_code_addr = bswap_64(jr->move.old_code_addr);
+                       jr->move.new_code_addr = bswap_64(jr->move.new_code_addr);
+                       jr->move.code_size     = bswap_64(jr->move.code_size);
+                       jr->move.code_index    = bswap_64(jr->move.code_index);
+               }
+               break;
+       case JIT_CODE_MAX:
+       default:
+               return NULL;
+       }
+       return jr;
+}
+
+static int
+jit_inject_event(struct jit_buf_desc *jd, union perf_event *event)
+{
+       ssize_t size;
+
+       size = perf_data_file__write(jd->output, event, event->header.size);
+       if (size < 0)
+               return -1;
+
+       jd->bytes_written += size;
+       return 0;
+}
+
+static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr)
+{
+       struct perf_sample sample;
+       union perf_event *event;
+       struct perf_tool *tool = jd->session->tool;
+       uint64_t code, addr;
+       uintptr_t uaddr;
+       char *filename;
+       struct stat st;
+       size_t size;
+       u16 idr_size;
+       const char *sym;
+       uint32_t count;
+       int ret, csize;
+       pid_t pid, tid;
+       struct {
+               u32 pid, tid;
+               u64 time;
+       } *id;
+
+       pid   = jr->load.pid;
+       tid   = jr->load.tid;
+       csize = jr->load.code_size;
+       addr  = jr->load.code_addr;
+       sym   = (void *)((unsigned long)jr + sizeof(jr->load));
+       code  = (unsigned long)jr + jr->load.p.total_size - csize;
+       count = jr->load.code_index;
+       idr_size = jd->machine->id_hdr_size;
+
+       event = calloc(1, sizeof(*event) + idr_size);
+       if (!event)
+               return -1;
+
+       filename = event->mmap2.filename;
+       size = snprintf(filename, PATH_MAX, "%s/jitted-%d-%u.so",
+                       jd->dir,
+                       pid,
+                       count);
+
+       size++; /* for \0 */
+
+       size = PERF_ALIGN(size, sizeof(u64));
+       uaddr = (uintptr_t)code;
+       ret = jit_emit_elf(filename, sym, addr, (const void *)uaddr, csize, jd->debug_data, jd->nr_debug_entries);
+
+       if (jd->debug_data && jd->nr_debug_entries) {
+               free(jd->debug_data);
+               jd->debug_data = NULL;
+               jd->nr_debug_entries = 0;
+       }
+
+       if (ret) {
+               free(event);
+               return -1;
+       }
+       if (stat(filename, &st))
+               memset(&st, 0, sizeof(stat));
+
+       event->mmap2.header.type = PERF_RECORD_MMAP2;
+       event->mmap2.header.misc = PERF_RECORD_MISC_USER;
+       event->mmap2.header.size = (sizeof(event->mmap2) -
+                       (sizeof(event->mmap2.filename) - size) + idr_size);
+
+       event->mmap2.pgoff = GEN_ELF_TEXT_OFFSET;
+       event->mmap2.start = addr;
+       event->mmap2.len   = csize;
+       event->mmap2.pid   = pid;
+       event->mmap2.tid   = tid;
+       event->mmap2.ino   = st.st_ino;
+       event->mmap2.maj   = major(st.st_dev);
+       event->mmap2.min   = minor(st.st_dev);
+       event->mmap2.prot  = st.st_mode;
+       event->mmap2.flags = MAP_SHARED;
+       event->mmap2.ino_generation = 1;
+
+       id = (void *)((unsigned long)event + event->mmap.header.size - idr_size);
+       if (jd->sample_type & PERF_SAMPLE_TID) {
+               id->pid  = pid;
+               id->tid  = tid;
+       }
+       if (jd->sample_type & PERF_SAMPLE_TIME)
+               id->time = jr->load.p.timestamp;
+
+       /*
+        * create pseudo sample to induce dso hit increment
+        * use first address as sample address
+        */
+       memset(&sample, 0, sizeof(sample));
+       sample.pid  = pid;
+       sample.tid  = tid;
+       sample.time = id->time;
+       sample.ip   = addr;
+
+       ret = perf_event__process_mmap2(tool, event, &sample, jd->machine);
+       if (ret)
+               return ret;
+
+       ret = jit_inject_event(jd, event);
+       /*
+        * mark dso as use to generate buildid in the header
+        */
+       if (!ret)
+               build_id__mark_dso_hit(tool, event, &sample, NULL, jd->machine);
+
+       return ret;
+}
+
+static int jit_repipe_code_move(struct jit_buf_desc *jd, union jr_entry *jr)
+{
+       struct perf_sample sample;
+       union perf_event *event;
+       struct perf_tool *tool = jd->session->tool;
+       char *filename;
+       size_t size;
+       struct stat st;
+       u16 idr_size;
+       int ret;
+       pid_t pid, tid;
+       struct {
+               u32 pid, tid;
+               u64 time;
+       } *id;
+
+       pid = jr->move.pid;
+       tid =  jr->move.tid;
+       idr_size = jd->machine->id_hdr_size;
+
+       /*
+        * +16 to account for sample_id_all (hack)
+        */
+       event = calloc(1, sizeof(*event) + 16);
+       if (!event)
+               return -1;
+
+       filename = event->mmap2.filename;
+       size = snprintf(filename, PATH_MAX, "%s/jitted-%d-%"PRIu64,
+                jd->dir,
+                pid,
+                jr->move.code_index);
+
+       size++; /* for \0 */
+
+       if (stat(filename, &st))
+               memset(&st, 0, sizeof(stat));
+
+       size = PERF_ALIGN(size, sizeof(u64));
+
+       event->mmap2.header.type = PERF_RECORD_MMAP2;
+       event->mmap2.header.misc = PERF_RECORD_MISC_USER;
+       event->mmap2.header.size = (sizeof(event->mmap2) -
+                       (sizeof(event->mmap2.filename) - size) + idr_size);
+       event->mmap2.pgoff = GEN_ELF_TEXT_OFFSET;
+       event->mmap2.start = jr->move.new_code_addr;
+       event->mmap2.len   = jr->move.code_size;
+       event->mmap2.pid   = pid;
+       event->mmap2.tid   = tid;
+       event->mmap2.ino   = st.st_ino;
+       event->mmap2.maj   = major(st.st_dev);
+       event->mmap2.min   = minor(st.st_dev);
+       event->mmap2.prot  = st.st_mode;
+       event->mmap2.flags = MAP_SHARED;
+       event->mmap2.ino_generation = 1;
+
+       id = (void *)((unsigned long)event + event->mmap.header.size - idr_size);
+       if (jd->sample_type & PERF_SAMPLE_TID) {
+               id->pid  = pid;
+               id->tid  = tid;
+       }
+       if (jd->sample_type & PERF_SAMPLE_TIME)
+               id->time = jr->load.p.timestamp;
+
+       /*
+        * create pseudo sample to induce dso hit increment
+        * use first address as sample address
+        */
+       memset(&sample, 0, sizeof(sample));
+       sample.pid  = pid;
+       sample.tid  = tid;
+       sample.time = id->time;
+       sample.ip   = jr->move.new_code_addr;
+
+       ret = perf_event__process_mmap2(tool, event, &sample, jd->machine);
+       if (ret)
+               return ret;
+
+       ret = jit_inject_event(jd, event);
+       if (!ret)
+               build_id__mark_dso_hit(tool, event, &sample, NULL, jd->machine);
+
+       return ret;
+}
+
+static int jit_repipe_debug_info(struct jit_buf_desc *jd, union jr_entry *jr)
+{
+       void *data;
+       size_t sz;
+
+       if (!(jd && jr))
+               return -1;
+
+       sz  = jr->prefix.total_size - sizeof(jr->info);
+       data = malloc(sz);
+       if (!data)
+               return -1;
+
+       memcpy(data, &jr->info.entries, sz);
+
+       jd->debug_data       = data;
+
+       /*
+        * we must use nr_entry instead of size here because
+        * we cannot distinguish actual entry from padding otherwise
+        */
+       jd->nr_debug_entries = jr->info.nr_entry;
+
+       return 0;
+}
+
+static int
+jit_process_dump(struct jit_buf_desc *jd)
+{
+       union jr_entry *jr;
+       int ret;
+
+       while ((jr = jit_get_next_entry(jd))) {
+               switch(jr->prefix.id) {
+               case JIT_CODE_LOAD:
+                       ret = jit_repipe_code_load(jd, jr);
+                       break;
+               case JIT_CODE_MOVE:
+                       ret = jit_repipe_code_move(jd, jr);
+                       break;
+               case JIT_CODE_DEBUG_INFO:
+                       ret = jit_repipe_debug_info(jd, jr);
+                       break;
+               default:
+                       ret = 0;
+                       continue;
+               }
+       }
+       return ret;
+}
+
+static int
+jit_inject(struct jit_buf_desc *jd, char *path)
+{
+       int ret;
+
+       if (verbose > 0)
+               fprintf(stderr, "injecting: %s\n", path);
+
+       ret = jit_open(jd, path);
+       if (ret)
+               return -1;
+
+       ret = jit_process_dump(jd);
+
+       jit_close(jd);
+
+       if (verbose > 0)
+               fprintf(stderr, "injected: %s (%d)\n", path, ret);
+
+       return 0;
+}
+
+/*
+ * File must be with pattern .../jit-XXXX.dump
+ * where XXXX is the PID of the process which did the mmap()
+ * as captured in the RECORD_MMAP record
+ */
+static int
+jit_detect(char *mmap_name, pid_t pid)
+ {
+       char *p;
+       char *end = NULL;
+       pid_t pid2;
+
+       if (verbose > 2)
+               fprintf(stderr, "jit marker trying : %s\n", mmap_name);
+       /*
+        * get file name
+        */
+       p = strrchr(mmap_name, '/');
+       if (!p)
+               return -1;
+
+       /*
+        * match prefix
+        */
+       if (strncmp(p, "/jit-", 5))
+               return -1;
+
+       /*
+        * skip prefix
+        */
+       p += 5;
+
+       /*
+        * must be followed by a pid
+        */
+       if (!isdigit(*p))
+               return -1;
+
+       pid2 = (int)strtol(p, &end, 10);
+       if (!end)
+               return -1;
+
+       /*
+        * pid does not match mmap pid
+        * pid==0 in system-wide mode (synthesized)
+        */
+       if (pid && pid2 != pid)
+               return -1;
+       /*
+        * validate suffix
+        */
+       if (strcmp(end, ".dump"))
+               return -1;
+
+       if (verbose > 0)
+               fprintf(stderr, "jit marker found: %s\n", mmap_name);
+
+       return 0;
+}
+
+int
+jit_process(struct perf_session *session,
+           struct perf_data_file *output,
+           struct machine *machine,
+           char *filename,
+           pid_t pid,
+           u64 *nbytes)
+{
+       struct perf_evsel *first;
+       struct jit_buf_desc jd;
+       int ret;
+
+       /*
+        * first, detect marker mmap (i.e., the jitdump mmap)
+        */
+       if (jit_detect(filename, pid))
+               return 0;
+
+       memset(&jd, 0, sizeof(jd));
+
+       jd.session = session;
+       jd.output  = output;
+       jd.machine = machine;
+
+       /*
+        * track sample_type to compute id_all layout
+        * perf sets the same sample type to all events as of now
+        */
+       first = perf_evlist__first(session->evlist);
+       jd.sample_type = first->attr.sample_type;
+
+       *nbytes = 0;
+
+       ret = jit_inject(&jd, filename);
+       if (!ret) {
+               *nbytes = jd.bytes_written;
+               ret = 1;
+       }
+
+       return ret;
+}
diff --git a/tools/perf/util/jitdump.h b/tools/perf/util/jitdump.h

new file mode 100644 (file)

index 0000000..b66c1f5
--- /dev/null
+++ b/tools/perf/util/jitdump.h
@@ -0,0 +1,124 @@
+/*
+ * jitdump.h: jitted code info encapsulation file format
+ *
+ * Adapted from OProfile GPLv2 support jidump.h:
+ * Copyright 2007 OProfile authors
+ * Jens Wilke
+ * Daniel Hansel
+ * Copyright IBM Corporation 2007
+ */
+#ifndef JITDUMP_H
+#define JITDUMP_H
+
+#include <sys/time.h>
+#include <time.h>
+#include <stdint.h>
+
+/* JiTD */
+#define JITHEADER_MAGIC                0x4A695444
+#define JITHEADER_MAGIC_SW     0x4454694A
+
+#define PADDING_8ALIGNED(x) ((((x) + 7) & 7) ^ 7)
+
+#define JITHEADER_VERSION 1
+
+enum jitdump_flags_bits {
+       JITDUMP_FLAGS_MAX_BIT,
+};
+
+#define JITDUMP_FLAGS_RESERVED (JITDUMP_FLAGS_MAX_BIT < 64 ? \
+                               (~((1ULL << JITDUMP_FLAGS_MAX_BIT) - 1)) : 0)
+
+struct jitheader {
+       uint32_t magic;         /* characters "jItD" */
+       uint32_t version;       /* header version */
+       uint32_t total_size;    /* total size of header */
+       uint32_t elf_mach;      /* elf mach target */
+       uint32_t pad1;          /* reserved */
+       uint32_t pid;           /* JIT process id */
+       uint64_t timestamp;     /* timestamp */
+       uint64_t flags;         /* flags */
+};
+
+enum jit_record_type {
+       JIT_CODE_LOAD           = 0,
+        JIT_CODE_MOVE           = 1,
+       JIT_CODE_DEBUG_INFO     = 2,
+       JIT_CODE_CLOSE          = 3,
+
+       JIT_CODE_MAX,
+};
+
+/* record prefix (mandatory in each record) */
+struct jr_prefix {
+       uint32_t id;
+       uint32_t total_size;
+       uint64_t timestamp;
+};
+
+struct jr_code_load {
+       struct jr_prefix p;
+
+       uint32_t pid;
+       uint32_t tid;
+       uint64_t vma;
+       uint64_t code_addr;
+       uint64_t code_size;
+       uint64_t code_index;
+};
+
+struct jr_code_close {
+       struct jr_prefix p;
+};
+
+struct jr_code_move {
+       struct jr_prefix p;
+
+       uint32_t pid;
+       uint32_t tid;
+       uint64_t vma;
+       uint64_t old_code_addr;
+       uint64_t new_code_addr;
+       uint64_t code_size;
+       uint64_t code_index;
+};
+
+struct debug_entry {
+       uint64_t addr;
+       int lineno;         /* source line number starting at 1 */
+       int discrim;        /* column discriminator, 0 is default */
+       const char name[0]; /* null terminated filename, \xff\0 if same as previous entry */
+};
+
+struct jr_code_debug_info {
+       struct jr_prefix p;
+
+       uint64_t code_addr;
+       uint64_t nr_entry;
+       struct debug_entry entries[0];
+};
+
+union jr_entry {
+        struct jr_code_debug_info info;
+        struct jr_code_close close;
+        struct jr_code_load load;
+        struct jr_code_move move;
+        struct jr_prefix prefix;
+};
+
+static inline struct debug_entry *
+debug_entry_next(struct debug_entry *ent)
+{
+       void *a = ent + 1;
+       size_t l = strlen(ent->name) + 1;
+       return a + l;
+}
+
+static inline char *
+debug_entry_file(struct debug_entry *ent)
+{
+       void *a = ent + 1;
+       return a;
+}
+
+#endif /* !JITDUMP_H */
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h

index ae825d4ec110fcfe6a06ef5e4daf50b03a214a87..d01e73592f6e34347f0c26531b58aa6ce1d57d52 100644 (file)
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -122,6 +122,7 @@ void exit_event_decode_key(struct perf_kvm_stat *kvm,
  
  bool kvm_exit_event(struct perf_evsel *evsel);
  bool kvm_entry_event(struct perf_evsel *evsel);
+int setup_kvm_events_tp(struct perf_kvm_stat *kvm);
  
  #define define_exit_reasons_table(name, symbols)       \
         static struct exit_reasons_table name[] = {     \
@@ -133,8 +134,13 @@ bool kvm_entry_event(struct perf_evsel *evsel);
   */
  int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid);
  
-extern const char * const kvm_events_tp[];
+extern const char *kvm_events_tp[];
  extern struct kvm_reg_events_ops kvm_reg_events_ops[];
  extern const char * const kvm_skip_events[];
+extern const char *vcpu_id_str;
+extern const int decode_str_len;
+extern const char *kvm_exit_reason;
+extern const char *kvm_entry_trace;
+extern const char *kvm_exit_trace;
  
  #endif /* __PERF_KVM_STAT_H */
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h

index 2c2b443df5ba796c43ee12c6c0f2061b5d7c7b80..1a3e45baf97fb575c02afe49707727aff0f628ec 100644 (file)
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -179,6 +179,16 @@ struct symbol *machine__find_kernel_symbol(struct machine *machine,
                                        mapp, filter);
  }
  
+static inline
+struct symbol *machine__find_kernel_symbol_by_name(struct machine *machine,
+                                                  enum map_type type, const char *name,
+                                                  struct map **mapp,
+                                                  symbol_filter_t filter)
+{
+       return map_groups__find_symbol_by_name(&machine->kmaps, type, name,
+                                              mapp, filter);
+}
+
  static inline
  struct symbol *machine__find_kernel_function(struct machine *machine, u64 addr,
                                              struct map **mapp,
diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c

new file mode 100644 (file)

index 0000000..75465f8
--- /dev/null
+++ b/tools/perf/util/mem-events.c
@@ -0,0 +1,255 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <api/fs/fs.h>
+#include "mem-events.h"
+#include "debug.h"
+#include "symbol.h"
+
+#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
+
+struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = {
+       E("ldlat-loads",        "cpu/mem-loads,ldlat=30/P",     "mem-loads"),
+       E("ldlat-stores",       "cpu/mem-stores/P",             "mem-stores"),
+};
+#undef E
+
+#undef E
+
+char *perf_mem_events__name(int i)
+{
+       return (char *)perf_mem_events[i].name;
+}
+
+int perf_mem_events__parse(const char *str)
+{
+       char *tok, *saveptr = NULL;
+       bool found = false;
+       char *buf;
+       int j;
+
+       /* We need buffer that we know we can write to. */
+       buf = malloc(strlen(str) + 1);
+       if (!buf)
+               return -ENOMEM;
+
+       strcpy(buf, str);
+
+       tok = strtok_r((char *)buf, ",", &saveptr);
+
+       while (tok) {
+               for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
+                       struct perf_mem_event *e = &perf_mem_events[j];
+
+                       if (strstr(e->tag, tok))
+                               e->record = found = true;
+               }
+
+               tok = strtok_r(NULL, ",", &saveptr);
+       }
+
+       free(buf);
+
+       if (found)
+               return 0;
+
+       pr_err("failed: event '%s' not found, use '-e list' to get list of available events\n", str);
+       return -1;
+}
+
+int perf_mem_events__init(void)
+{
+       const char *mnt = sysfs__mount();
+       bool found = false;
+       int j;
+
+       if (!mnt)
+               return -ENOENT;
+
+       for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
+               char path[PATH_MAX];
+               struct perf_mem_event *e = &perf_mem_events[j];
+               struct stat st;
+
+               scnprintf(path, PATH_MAX, "%s/devices/cpu/events/%s",
+                         mnt, e->sysfs_name);
+
+               if (!stat(path, &st))
+                       e->supported = found = true;
+       }
+
+       return found ? 0 : -ENOENT;
+}
+
+static const char * const tlb_access[] = {
+       "N/A",
+       "HIT",
+       "MISS",
+       "L1",
+       "L2",
+       "Walker",
+       "Fault",
+};
+
+int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
+{
+       size_t l = 0, i;
+       u64 m = PERF_MEM_TLB_NA;
+       u64 hit, miss;
+
+       sz -= 1; /* -1 for null termination */
+       out[0] = '\0';
+
+       if (mem_info)
+               m = mem_info->data_src.mem_dtlb;
+
+       hit = m & PERF_MEM_TLB_HIT;
+       miss = m & PERF_MEM_TLB_MISS;
+
+       /* already taken care of */
+       m &= ~(PERF_MEM_TLB_HIT|PERF_MEM_TLB_MISS);
+
+       for (i = 0; m && i < ARRAY_SIZE(tlb_access); i++, m >>= 1) {
+               if (!(m & 0x1))
+                       continue;
+               if (l) {
+                       strcat(out, " or ");
+                       l += 4;
+               }
+               l += scnprintf(out + l, sz - l, tlb_access[i]);
+       }
+       if (*out == '\0')
+               l += scnprintf(out, sz - l, "N/A");
+       if (hit)
+               l += scnprintf(out + l, sz - l, " hit");
+       if (miss)
+               l += scnprintf(out + l, sz - l, " miss");
+
+       return l;
+}
+
+static const char * const mem_lvl[] = {
+       "N/A",
+       "HIT",
+       "MISS",
+       "L1",
+       "LFB",
+       "L2",
+       "L3",
+       "Local RAM",
+       "Remote RAM (1 hop)",
+       "Remote RAM (2 hops)",
+       "Remote Cache (1 hop)",
+       "Remote Cache (2 hops)",
+       "I/O",
+       "Uncached",
+};
+
+int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
+{
+       size_t i, l = 0;
+       u64 m =  PERF_MEM_LVL_NA;
+       u64 hit, miss;
+
+       if (mem_info)
+               m  = mem_info->data_src.mem_lvl;
+
+       sz -= 1; /* -1 for null termination */
+       out[0] = '\0';
+
+       hit = m & PERF_MEM_LVL_HIT;
+       miss = m & PERF_MEM_LVL_MISS;
+
+       /* already taken care of */
+       m &= ~(PERF_MEM_LVL_HIT|PERF_MEM_LVL_MISS);
+
+       for (i = 0; m && i < ARRAY_SIZE(mem_lvl); i++, m >>= 1) {
+               if (!(m & 0x1))
+                       continue;
+               if (l) {
+                       strcat(out, " or ");
+                       l += 4;
+               }
+               l += scnprintf(out + l, sz - l, mem_lvl[i]);
+       }
+       if (*out == '\0')
+               l += scnprintf(out, sz - l, "N/A");
+       if (hit)
+               l += scnprintf(out + l, sz - l, " hit");
+       if (miss)
+               l += scnprintf(out + l, sz - l, " miss");
+
+       return l;
+}
+
+static const char * const snoop_access[] = {
+       "N/A",
+       "None",
+       "Miss",
+       "Hit",
+       "HitM",
+};
+
+int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
+{
+       size_t i, l = 0;
+       u64 m = PERF_MEM_SNOOP_NA;
+
+       sz -= 1; /* -1 for null termination */
+       out[0] = '\0';
+
+       if (mem_info)
+               m = mem_info->data_src.mem_snoop;
+
+       for (i = 0; m && i < ARRAY_SIZE(snoop_access); i++, m >>= 1) {
+               if (!(m & 0x1))
+                       continue;
+               if (l) {
+                       strcat(out, " or ");
+                       l += 4;
+               }
+               l += scnprintf(out + l, sz - l, snoop_access[i]);
+       }
+
+       if (*out == '\0')
+               l += scnprintf(out, sz - l, "N/A");
+
+       return l;
+}
+
+int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
+{
+       u64 mask = PERF_MEM_LOCK_NA;
+       int l;
+
+       if (mem_info)
+               mask = mem_info->data_src.mem_lock;
+
+       if (mask & PERF_MEM_LOCK_NA)
+               l = scnprintf(out, sz, "N/A");
+       else if (mask & PERF_MEM_LOCK_LOCKED)
+               l = scnprintf(out, sz, "Yes");
+       else
+               l = scnprintf(out, sz, "No");
+
+       return l;
+}
+
+int perf_script__meminfo_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
+{
+       int i = 0;
+
+       i += perf_mem__lvl_scnprintf(out, sz, mem_info);
+       i += scnprintf(out + i, sz - i, "|SNP ");
+       i += perf_mem__snp_scnprintf(out + i, sz - i, mem_info);
+       i += scnprintf(out + i, sz - i, "|TLB ");
+       i += perf_mem__tlb_scnprintf(out + i, sz - i, mem_info);
+       i += scnprintf(out + i, sz - i, "|LCK ");
+       i += perf_mem__lck_scnprintf(out + i, sz - i, mem_info);
+
+       return i;
+}
diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h

new file mode 100644 (file)

index 0000000..5d6d930
--- /dev/null
+++ b/tools/perf/util/mem-events.h
@@ -0,0 +1,35 @@
+#ifndef __PERF_MEM_EVENTS_H
+#define __PERF_MEM_EVENTS_H
+
+#include <stdbool.h>
+
+struct perf_mem_event {
+       bool            record;
+       bool            supported;
+       const char      *tag;
+       const char      *name;
+       const char      *sysfs_name;
+};
+
+enum {
+       PERF_MEM_EVENTS__LOAD,
+       PERF_MEM_EVENTS__STORE,
+       PERF_MEM_EVENTS__MAX,
+};
+
+extern struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX];
+
+int perf_mem_events__parse(const char *str);
+int perf_mem_events__init(void);
+
+char *perf_mem_events__name(int i);
+
+struct mem_info;
+int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
+int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
+int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
+int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
+
+int perf_script__meminfo_scnprintf(char *bf, size_t size, struct mem_info *mem_info);
+
+#endif /* __PERF_MEM_EVENTS_H */
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c

index 813d9b272c813b0dd749470f6209aa3a0a0607b4..4c19d5e79d8c4d626eb3fa91486cc1d83447aeeb 100644 (file)
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -279,7 +279,24 @@ const char *event_type(int type)
         return "unknown";
  }
  
+static int parse_events__is_name_term(struct parse_events_term *term)
+{
+       return term->type_term == PARSE_EVENTS__TERM_TYPE_NAME;
+}
  
+static char *get_config_name(struct list_head *head_terms)
+{
+       struct parse_events_term *term;
+
+       if (!head_terms)
+               return NULL;
+
+       list_for_each_entry(term, head_terms, list)
+               if (parse_events__is_name_term(term))
+                       return term->val.str;
+
+       return NULL;
+}
  
  static struct perf_evsel *
  __add_event(struct list_head *list, int *idx,
@@ -333,11 +350,25 @@ static int parse_aliases(char *str, const char *names[][PERF_EVSEL__MAX_ALIASES]
         return -1;
  }
  
+typedef int config_term_func_t(struct perf_event_attr *attr,
+                              struct parse_events_term *term,
+                              struct parse_events_error *err);
+static int config_term_common(struct perf_event_attr *attr,
+                             struct parse_events_term *term,
+                             struct parse_events_error *err);
+static int config_attr(struct perf_event_attr *attr,
+                      struct list_head *head,
+                      struct parse_events_error *err,
+                      config_term_func_t config_term);
+
  int parse_events_add_cache(struct list_head *list, int *idx,
-                          char *type, char *op_result1, char *op_result2)
+                          char *type, char *op_result1, char *op_result2,
+                          struct parse_events_error *err,
+                          struct list_head *head_config)
  {
         struct perf_event_attr attr;
-       char name[MAX_NAME_LEN];
+       LIST_HEAD(config_terms);
+       char name[MAX_NAME_LEN], *config_name;
         int cache_type = -1, cache_op = -1, cache_result = -1;
         char *op_result[2] = { op_result1, op_result2 };
         int i, n;
@@ -351,6 +382,7 @@ int parse_events_add_cache(struct list_head *list, int *idx,
         if (cache_type == -1)
                 return -EINVAL;
  
+       config_name = get_config_name(head_config);
         n = snprintf(name, MAX_NAME_LEN, "%s", type);
  
         for (i = 0; (i < 2) && (op_result[i]); i++) {
@@ -391,7 +423,16 @@ int parse_events_add_cache(struct list_head *list, int *idx,
         memset(&attr, 0, sizeof(attr));
         attr.config = cache_type | (cache_op << 8) | (cache_result << 16);
         attr.type = PERF_TYPE_HW_CACHE;
-       return add_event(list, idx, &attr, name, NULL);
+
+       if (head_config) {
+               if (config_attr(&attr, head_config, err,
+                               config_term_common))
+                       return -EINVAL;
+
+               if (get_config_terms(head_config, &config_terms))
+                       return -ENOMEM;
+       }
+       return add_event(list, idx, &attr, config_name ? : name, &config_terms);
  }
  
  static void tracepoint_error(struct parse_events_error *e, int err,
@@ -540,6 +581,7 @@ static int add_tracepoint_multi_sys(struct list_head *list, int *idx,
  struct __add_bpf_event_param {
         struct parse_events_evlist *data;
         struct list_head *list;
+       struct list_head *head_config;
  };
  
  static int add_bpf_event(struct probe_trace_event *tev, int fd,
@@ -556,7 +598,8 @@ static int add_bpf_event(struct probe_trace_event *tev, int fd,
                  tev->group, tev->event, fd);
  
         err = parse_events_add_tracepoint(&new_evsels, &evlist->idx, tev->group,
-                                         tev->event, evlist->error, NULL);
+                                         tev->event, evlist->error,
+                                         param->head_config);
         if (err) {
                 struct perf_evsel *evsel, *tmp;
  
@@ -581,11 +624,12 @@ static int add_bpf_event(struct probe_trace_event *tev, int fd,
  
  int parse_events_load_bpf_obj(struct parse_events_evlist *data,
                               struct list_head *list,
-                             struct bpf_object *obj)
+                             struct bpf_object *obj,
+                             struct list_head *head_config)
  {
         int err;
         char errbuf[BUFSIZ];
-       struct __add_bpf_event_param param = {data, list};
+       struct __add_bpf_event_param param = {data, list, head_config};
         static bool registered_unprobe_atexit = false;
  
         if (IS_ERR(obj) || !obj) {
@@ -631,17 +675,99 @@ errout:
         return err;
  }
  
+static int
+parse_events_config_bpf(struct parse_events_evlist *data,
+                       struct bpf_object *obj,
+                       struct list_head *head_config)
+{
+       struct parse_events_term *term;
+       int error_pos;
+
+       if (!head_config || list_empty(head_config))
+               return 0;
+
+       list_for_each_entry(term, head_config, list) {
+               char errbuf[BUFSIZ];
+               int err;
+
+               if (term->type_term != PARSE_EVENTS__TERM_TYPE_USER) {
+                       snprintf(errbuf, sizeof(errbuf),
+                                "Invalid config term for BPF object");
+                       errbuf[BUFSIZ - 1] = '\0';
+
+                       data->error->idx = term->err_term;
+                       data->error->str = strdup(errbuf);
+                       return -EINVAL;
+               }
+
+               err = bpf__config_obj(obj, term, data->evlist, &error_pos);
+               if (err) {
+                       bpf__strerror_config_obj(obj, term, data->evlist,
+                                                &error_pos, err, errbuf,
+                                                sizeof(errbuf));
+                       data->error->help = strdup(
+"Hint:\tValid config terms:\n"
+"     \tmap:[<arraymap>].value<indices>=[value]\n"
+"     \tmap:[<eventmap>].event<indices>=[event]\n"
+"\n"
+"     \twhere <indices> is something like [0,3...5] or [all]\n"
+"     \t(add -v to see detail)");
+                       data->error->str = strdup(errbuf);
+                       if (err == -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE)
+                               data->error->idx = term->err_val;
+                       else
+                               data->error->idx = term->err_term + error_pos;
+                       return err;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Split config terms:
+ * perf record -e bpf.c/call-graph=fp,map:array.value[0]=1/ ...
+ *  'call-graph=fp' is 'evt config', should be applied to each
+ *  events in bpf.c.
+ * 'map:array.value[0]=1' is 'obj config', should be processed
+ * with parse_events_config_bpf.
+ *
+ * Move object config terms from the first list to obj_head_config.
+ */
+static void
+split_bpf_config_terms(struct list_head *evt_head_config,
+                      struct list_head *obj_head_config)
+{
+       struct parse_events_term *term, *temp;
+
+       /*
+        * Currectly, all possible user config term
+        * belong to bpf object. parse_events__is_hardcoded_term()
+        * happends to be a good flag.
+        *
+        * See parse_events_config_bpf() and
+        * config_term_tracepoint().
+        */
+       list_for_each_entry_safe(term, temp, evt_head_config, list)
+               if (!parse_events__is_hardcoded_term(term))
+                       list_move_tail(&term->list, obj_head_config);
+}
+
  int parse_events_load_bpf(struct parse_events_evlist *data,
                           struct list_head *list,
                           char *bpf_file_name,
-                         bool source)
+                         bool source,
+                         struct list_head *head_config)
  {
+       int err;
         struct bpf_object *obj;
+       LIST_HEAD(obj_head_config);
+
+       if (head_config)
+               split_bpf_config_terms(head_config, &obj_head_config);
  
         obj = bpf__prepare_load(bpf_file_name, source);
         if (IS_ERR(obj)) {
                 char errbuf[BUFSIZ];
-               int err;
  
                 err = PTR_ERR(obj);
  
@@ -659,7 +785,18 @@ int parse_events_load_bpf(struct parse_events_evlist *data,
                 return err;
         }
  
-       return parse_events_load_bpf_obj(data, list, obj);
+       err = parse_events_load_bpf_obj(data, list, obj, head_config);
+       if (err)
+               return err;
+       err = parse_events_config_bpf(data, obj, &obj_head_config);
+
+       /*
+        * Caller doesn't know anything about obj_head_config,
+        * so combine them together again before returnning.
+        */
+       if (head_config)
+               list_splice_tail(&obj_head_config, head_config);
+       return err;
  }
  
  static int
@@ -746,9 +883,59 @@ static int check_type_val(struct parse_events_term *term,
         return -EINVAL;
  }
  
-typedef int config_term_func_t(struct perf_event_attr *attr,
-                              struct parse_events_term *term,
-                              struct parse_events_error *err);
+/*
+ * Update according to parse-events.l
+ */
+static const char *config_term_names[__PARSE_EVENTS__TERM_TYPE_NR] = {
+       [PARSE_EVENTS__TERM_TYPE_USER]                  = "<sysfs term>",
+       [PARSE_EVENTS__TERM_TYPE_CONFIG]                = "config",
+       [PARSE_EVENTS__TERM_TYPE_CONFIG1]               = "config1",
+       [PARSE_EVENTS__TERM_TYPE_CONFIG2]               = "config2",
+       [PARSE_EVENTS__TERM_TYPE_NAME]                  = "name",
+       [PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD]         = "period",
+       [PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ]           = "freq",
+       [PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE]    = "branch_type",
+       [PARSE_EVENTS__TERM_TYPE_TIME]                  = "time",
+       [PARSE_EVENTS__TERM_TYPE_CALLGRAPH]             = "call-graph",
+       [PARSE_EVENTS__TERM_TYPE_STACKSIZE]             = "stack-size",
+       [PARSE_EVENTS__TERM_TYPE_NOINHERIT]             = "no-inherit",
+       [PARSE_EVENTS__TERM_TYPE_INHERIT]               = "inherit",
+};
+
+static bool config_term_shrinked;
+
+static bool
+config_term_avail(int term_type, struct parse_events_error *err)
+{
+       if (term_type < 0 || term_type >= __PARSE_EVENTS__TERM_TYPE_NR) {
+               err->str = strdup("Invalid term_type");
+               return false;
+       }
+       if (!config_term_shrinked)
+               return true;
+
+       switch (term_type) {
+       case PARSE_EVENTS__TERM_TYPE_CONFIG:
+       case PARSE_EVENTS__TERM_TYPE_CONFIG1:
+       case PARSE_EVENTS__TERM_TYPE_CONFIG2:
+       case PARSE_EVENTS__TERM_TYPE_NAME:
+               return true;
+       default:
+               if (!err)
+                       return false;
+
+               /* term_type is validated so indexing is safe */
+               if (asprintf(&err->str, "'%s' is not usable in 'perf stat'",
+                            config_term_names[term_type]) < 0)
+                       err->str = NULL;
+               return false;
+       }
+}
+
+void parse_events__shrink_config_terms(void)
+{
+       config_term_shrinked = true;
+}
  
  static int config_term_common(struct perf_event_attr *attr,
                               struct parse_events_term *term,
@@ -815,6 +1002,17 @@ do {                                                                         \
                 return -EINVAL;
         }
  
+       /*
+        * Check term availbility after basic checking so
+        * PARSE_EVENTS__TERM_TYPE_USER can be found and filtered.
+        *
+        * If check availbility at the entry of this function,
+        * user will see "'<sysfs term>' is not usable in 'perf stat'"
+        * if an invalid config term is provided for legacy events
+        * (for example, instructions/badterm/...), which is confusing.
+        */
+       if (!config_term_avail(term->type_term, err))
+               return -EINVAL;
         return 0;
  #undef CHECK_TYPE_VAL
  }
@@ -961,23 +1159,8 @@ int parse_events_add_numeric(struct parse_events_evlist *data,
                         return -ENOMEM;
         }
  
-       return add_event(list, &data->idx, &attr, NULL, &config_terms);
-}
-
-static int parse_events__is_name_term(struct parse_events_term *term)
-{
-       return term->type_term == PARSE_EVENTS__TERM_TYPE_NAME;
-}
-
-static char *pmu_event_name(struct list_head *head_terms)
-{
-       struct parse_events_term *term;
-
-       list_for_each_entry(term, head_terms, list)
-               if (parse_events__is_name_term(term))
-                       return term->val.str;
-
-       return NULL;
+       return add_event(list, &data->idx, &attr,
+                        get_config_name(head_config), &config_terms);
  }
  
  int parse_events_add_pmu(struct parse_events_evlist *data,
@@ -1024,7 +1207,7 @@ int parse_events_add_pmu(struct parse_events_evlist *data,
                 return -EINVAL;
  
         evsel = __add_event(list, &data->idx, &attr,
-                           pmu_event_name(head_config), pmu->cpus,
+                           get_config_name(head_config), pmu->cpus,
                             &config_terms);
         if (evsel) {
                 evsel->unit = info.unit;
@@ -1386,8 +1569,7 @@ int parse_events_terms(struct list_head *terms, const char *str)
                 return 0;
         }
  
-       if (data.terms)
-               parse_events__free_terms(data.terms);
+       parse_events_terms__delete(data.terms);
         return ret;
  }
  
@@ -1395,9 +1577,10 @@ int parse_events(struct perf_evlist *evlist, const char *str,
                  struct parse_events_error *err)
  {
         struct parse_events_evlist data = {
-               .list  = LIST_HEAD_INIT(data.list),
-               .idx   = evlist->nr_entries,
-               .error = err,
+               .list   = LIST_HEAD_INIT(data.list),
+               .idx    = evlist->nr_entries,
+               .error  = err,
+               .evlist = evlist,
         };
         int ret;
  
@@ -2068,12 +2251,29 @@ int parse_events_term__clone(struct parse_events_term **new,
                         term->err_term, term->err_val);
  }
  
-void parse_events__free_terms(struct list_head *terms)
+void parse_events_terms__purge(struct list_head *terms)
  {
         struct parse_events_term *term, *h;
  
-       list_for_each_entry_safe(term, h, terms, list)
+       list_for_each_entry_safe(term, h, terms, list) {
+               if (term->array.nr_ranges)
+                       free(term->array.ranges);
+               list_del_init(&term->list);
                 free(term);
+       }
+}
+
+void parse_events_terms__delete(struct list_head *terms)
+{
+       if (!terms)
+               return;
+       parse_events_terms__purge(terms);
+       free(terms);
+}
+
+void parse_events__clear_array(struct parse_events_array *a)
+{
+       free(a->ranges);
  }
  
  void parse_events_evlist_error(struct parse_events_evlist *data,
@@ -2088,6 +2288,33 @@ void parse_events_evlist_error(struct parse_events_evlist *data,
         WARN_ONCE(!err->str, "WARNING: failed to allocate error string");
  }
  
+static void config_terms_list(char *buf, size_t buf_sz)
+{
+       int i;
+       bool first = true;
+
+       buf[0] = '\0';
+       for (i = 0; i < __PARSE_EVENTS__TERM_TYPE_NR; i++) {
+               const char *name = config_term_names[i];
+
+               if (!config_term_avail(i, NULL))
+                       continue;
+               if (!name)
+                       continue;
+               if (name[0] == '<')
+                       continue;
+
+               if (strlen(buf) + strlen(name) + 2 >= buf_sz)
+                       return;
+
+               if (!first)
+                       strcat(buf, ",");
+               else
+                       first = false;
+               strcat(buf, name);
+       }
+}
+
  /*
   * Return string contains valid config terms of an event.
   * @additional_terms: For terms such as PMU sysfs terms.
@@ -2095,17 +2322,18 @@ void parse_events_evlist_error(struct parse_events_evlist *data,
  char *parse_events_formats_error_string(char *additional_terms)
  {
         char *str;
-       static const char *static_terms = "config,config1,config2,name,"
-                                         "period,freq,branch_type,time,"
-                                         "call-graph,stack-size\n";
+       /* "branch_type" is the longest name */
+       char static_terms[__PARSE_EVENTS__TERM_TYPE_NR *
+                         (sizeof("branch_type") - 1)];
  
+       config_terms_list(static_terms, sizeof(static_terms));
         /* valid terms */
         if (additional_terms) {
-               if (!asprintf(&str, "valid terms: %s,%s",
-                             additional_terms, static_terms))
+               if (asprintf(&str, "valid terms: %s,%s",
+                            additional_terms, static_terms) < 0)
                         goto fail;
         } else {
-               if (!asprintf(&str, "valid terms: %s", static_terms))
+               if (asprintf(&str, "valid terms: %s", static_terms) < 0)
                         goto fail;
         }
         return str;
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h

index f1a6db107241b1c8ffaf03a3514ba1549df6fd1b..67e493088e81c29c17819e94b37da4f83fae073f 100644 (file)
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -68,11 +68,21 @@ enum {
         PARSE_EVENTS__TERM_TYPE_CALLGRAPH,
         PARSE_EVENTS__TERM_TYPE_STACKSIZE,
         PARSE_EVENTS__TERM_TYPE_NOINHERIT,
-       PARSE_EVENTS__TERM_TYPE_INHERIT
+       PARSE_EVENTS__TERM_TYPE_INHERIT,
+       __PARSE_EVENTS__TERM_TYPE_NR,
+};
+
+struct parse_events_array {
+       size_t nr_ranges;
+       struct {
+               unsigned int start;
+               size_t length;
+       } *ranges;
  };
  
  struct parse_events_term {
         char *config;
+       struct parse_events_array array;
         union {
                 char *str;
                 u64  num;
@@ -98,12 +108,14 @@ struct parse_events_evlist {
         int                        idx;
         int                        nr_groups;
         struct parse_events_error *error;
+       struct perf_evlist        *evlist;
  };
  
  struct parse_events_terms {
         struct list_head *terms;
  };
  
+void parse_events__shrink_config_terms(void);
  int parse_events__is_hardcoded_term(struct parse_events_term *term);
  int parse_events_term__num(struct parse_events_term **term,
                            int type_term, char *config, u64 num,
@@ -115,7 +127,9 @@ int parse_events_term__sym_hw(struct parse_events_term **term,
                               char *config, unsigned idx);
  int parse_events_term__clone(struct parse_events_term **new,
                              struct parse_events_term *term);
-void parse_events__free_terms(struct list_head *terms);
+void parse_events_terms__delete(struct list_head *terms);
+void parse_events_terms__purge(struct list_head *terms);
+void parse_events__clear_array(struct parse_events_array *a);
  int parse_events__modifier_event(struct list_head *list, char *str, bool add);
  int parse_events__modifier_group(struct list_head *list, char *event_mod);
  int parse_events_name(struct list_head *list, char *name);
@@ -126,18 +140,22 @@ int parse_events_add_tracepoint(struct list_head *list, int *idx,
  int parse_events_load_bpf(struct parse_events_evlist *data,
                           struct list_head *list,
                           char *bpf_file_name,
-                         bool source);
+                         bool source,
+                         struct list_head *head_config);
  /* Provide this function for perf test */
  struct bpf_object;
  int parse_events_load_bpf_obj(struct parse_events_evlist *data,
                               struct list_head *list,
-                             struct bpf_object *obj);
+                             struct bpf_object *obj,
+                             struct list_head *head_config);
  int parse_events_add_numeric(struct parse_events_evlist *data,
                              struct list_head *list,
                              u32 type, u64 config,
                              struct list_head *head_config);
  int parse_events_add_cache(struct list_head *list, int *idx,
-                          char *type, char *op_result1, char *op_result2);
+                          char *type, char *op_result1, char *op_result2,
+                          struct parse_events_error *error,
+                          struct list_head *head_config);
  int parse_events_add_breakpoint(struct list_head *list, int *idx,
                                 void *ptr, char *type, u64 len);
  int parse_events_add_pmu(struct parse_events_evlist *data,
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l

index 58c5831ffd5c22133f48a4c1a3a07721c71362fa..1477fbc78993c7b31d7bb1531f630cca8587d93f 100644 (file)
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -9,8 +9,8 @@
  %{
  #include <errno.h>
  #include "../perf.h"
-#include "parse-events-bison.h"
  #include "parse-events.h"
+#include "parse-events-bison.h"
  
  char *parse_events_get_text(yyscan_t yyscanner);
  YYSTYPE *parse_events_get_lval(yyscan_t yyscanner);
@@ -111,6 +111,7 @@ do {                                                        \
  %x mem
  %s config
  %x event
+%x array
  
  group          [^,{}/]*[{][^}]*[}][^,{}/]*
  event_pmu      [^,{}/]+[/][^/]*[/][^,{}/]*
@@ -122,7 +123,7 @@ num_dec             [0-9]+
  num_hex                0x[a-fA-F0-9]+
  num_raw_hex    [a-fA-F0-9]+
  name           [a-zA-Z_*?][a-zA-Z0-9_*?.]*
-name_minus     [a-zA-Z_*?][a-zA-Z0-9\-_*?.]*
+name_minus     [a-zA-Z_*?][a-zA-Z0-9\-_*?.:]*
  /* If you add a modifier you need to update check_modifier() */
  modifier_event [ukhpPGHSDI]+
  modifier_bp    [rwx]{1,3}
@@ -176,10 +177,17 @@ modifier_bp       [rwx]{1,3}
  
  }
  
+<array>{
+"]"                    { BEGIN(config); return ']'; }
+{num_dec}              { return value(yyscanner, 10); }
+{num_hex}              { return value(yyscanner, 16); }
+,                      { return ','; }
+"\.\.\."               { return PE_ARRAY_RANGE; }
+}
+
  <config>{
         /*
-        * Please update parse_events_formats_error_string any time
-        * new static term is added.
+        * Please update config_term_names when new static term is added.
          */
  config                 { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG); }
  config1                        { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG1); }
@@ -196,6 +204,8 @@ no-inherit          { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOINHERIT); }
  ,                      { return ','; }
  "/"                    { BEGIN(INITIAL); return '/'; }
  {name_minus}           { return str(yyscanner, PE_NAME); }
+\[all\]                        { return PE_ARRAY_ALL; }
+"["                    { BEGIN(array); return '['; }
  }
  
  <mem>{
@@ -238,6 +248,7 @@ cpu-migrations|migrations                   { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COU
  alignment-faults                               { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_ALIGNMENT_FAULTS); }
  emulation-faults                               { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS); }
  dummy                                          { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); }
+bpf-output                                     { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); }
  
         /*
          * We have to handle the kernel PMU event cycles-ct/cycles-t/mem-loads/mem-stores separately.
diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y

index ad379968d4c10c0fb7bb2ddc3bacce3dc1166f43..5be4a5f216d6d23889e07e225810fec40df67c30 100644 (file)
--- a/tools/perf/util/parse-events.y
+++ b/tools/perf/util/parse-events.y
@@ -28,7 +28,7 @@ do { \
         INIT_LIST_HEAD(list);         \
  } while (0)
  
-static inc_group_count(struct list_head *list,
+static void inc_group_count(struct list_head *list,
                        struct parse_events_evlist *data)
  {
         /* Count groups only have more than 1 members */
@@ -48,6 +48,7 @@ static inc_group_count(struct list_head *list,
  %token PE_PREFIX_MEM PE_PREFIX_RAW PE_PREFIX_GROUP
  %token PE_ERROR
  %token PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT
+%token PE_ARRAY_ALL PE_ARRAY_RANGE
  %type <num> PE_VALUE
  %type <num> PE_VALUE_SYM_HW
  %type <num> PE_VALUE_SYM_SW
@@ -64,6 +65,7 @@ static inc_group_count(struct list_head *list,
  %type <str> PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT
  %type <num> value_sym
  %type <head> event_config
+%type <head> opt_event_config
  %type <term> event_term
  %type <head> event_pmu
  %type <head> event_legacy_symbol
@@ -82,6 +84,9 @@ static inc_group_count(struct list_head *list,
  %type <head> group_def
  %type <head> group
  %type <head> groups
+%type <array> array
+%type <array> array_term
+%type <array> array_terms
  
  %union
  {
@@ -93,6 +98,7 @@ static inc_group_count(struct list_head *list,
                 char *sys;
                 char *event;
         } tracepoint_name;
+       struct parse_events_array array;
  }
  %%
  
@@ -211,24 +217,14 @@ event_def: event_pmu |
            event_bpf_file
  
  event_pmu:
-PE_NAME '/' event_config '/'
+PE_NAME opt_event_config
  {
         struct parse_events_evlist *data = _data;
         struct list_head *list;
  
         ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_pmu(data, list, $1, $3));
-       parse_events__free_terms($3);
-       $$ = list;
-}
-|
-PE_NAME '/' '/'
-{
-       struct parse_events_evlist *data = _data;
-       struct list_head *list;
-
-       ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_pmu(data, list, $1, NULL));
+       ABORT_ON(parse_events_add_pmu(data, list, $1, $2));
+       parse_events_terms__delete($2);
         $$ = list;
  }
  |
@@ -246,7 +242,7 @@ PE_KERNEL_PMU_EVENT sep_dc
  
         ALLOC_LIST(list);
         ABORT_ON(parse_events_add_pmu(data, list, "cpu", head));
-       parse_events__free_terms(head);
+       parse_events_terms__delete(head);
         $$ = list;
  }
  |
@@ -266,7 +262,7 @@ PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc
  
         ALLOC_LIST(list);
         ABORT_ON(parse_events_add_pmu(data, list, "cpu", head));
-       parse_events__free_terms(head);
+       parse_events_terms__delete(head);
         $$ = list;
  }
  
@@ -285,7 +281,7 @@ value_sym '/' event_config '/'
  
         ALLOC_LIST(list);
         ABORT_ON(parse_events_add_numeric(data, list, type, config, $3));
-       parse_events__free_terms($3);
+       parse_events_terms__delete($3);
         $$ = list;
  }
  |
@@ -302,33 +298,39 @@ value_sym sep_slash_dc
  }
  
  event_legacy_cache:
-PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT
+PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT opt_event_config
  {
         struct parse_events_evlist *data = _data;
+       struct parse_events_error *error = data->error;
         struct list_head *list;
  
         ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, $5));
+       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, $5, error, $6));
+       parse_events_terms__delete($6);
         $$ = list;
  }
  |
-PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT
+PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT opt_event_config
  {
         struct parse_events_evlist *data = _data;
+       struct parse_events_error *error = data->error;
         struct list_head *list;
  
         ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, NULL));
+       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, NULL, error, $4));
+       parse_events_terms__delete($4);
         $$ = list;
  }
  |
-PE_NAME_CACHE_TYPE
+PE_NAME_CACHE_TYPE opt_event_config
  {
         struct parse_events_evlist *data = _data;
+       struct parse_events_error *error = data->error;
         struct list_head *list;
  
         ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, NULL, NULL));
+       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, NULL, NULL, error, $2));
+       parse_events_terms__delete($2);
         $$ = list;
  }
  
@@ -378,24 +380,7 @@ PE_PREFIX_MEM PE_VALUE sep_dc
  }
  
  event_legacy_tracepoint:
-tracepoint_name
-{
-       struct parse_events_evlist *data = _data;
-       struct parse_events_error *error = data->error;
-       struct list_head *list;
-
-       ALLOC_LIST(list);
-       if (error)
-               error->idx = @1.first_column;
-
-       if (parse_events_add_tracepoint(list, &data->idx, $1.sys, $1.event,
-                                       error, NULL))
-               return -1;
-
-       $$ = list;
-}
-|
-tracepoint_name '/' event_config '/'
+tracepoint_name opt_event_config
  {
         struct parse_events_evlist *data = _data;
         struct parse_events_error *error = data->error;
@@ -406,7 +391,7 @@ tracepoint_name '/' event_config '/'
                 error->idx = @1.first_column;
  
         if (parse_events_add_tracepoint(list, &data->idx, $1.sys, $1.event,
-                                       error, $3))
+                                       error, $2))
                 return -1;
  
         $$ = list;
@@ -433,49 +418,68 @@ PE_NAME ':' PE_NAME
  }
  
  event_legacy_numeric:
-PE_VALUE ':' PE_VALUE
+PE_VALUE ':' PE_VALUE opt_event_config
  {
         struct parse_events_evlist *data = _data;
         struct list_head *list;
  
         ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_numeric(data, list, (u32)$1, $3, NULL));
+       ABORT_ON(parse_events_add_numeric(data, list, (u32)$1, $3, $4));
+       parse_events_terms__delete($4);
         $$ = list;
  }
  
  event_legacy_raw:
-PE_RAW
+PE_RAW opt_event_config
  {
         struct parse_events_evlist *data = _data;
         struct list_head *list;
  
         ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_numeric(data, list, PERF_TYPE_RAW, $1, NULL));
+       ABORT_ON(parse_events_add_numeric(data, list, PERF_TYPE_RAW, $1, $2));
+       parse_events_terms__delete($2);
         $$ = list;
  }
  
  event_bpf_file:
-PE_BPF_OBJECT
+PE_BPF_OBJECT opt_event_config
  {
         struct parse_events_evlist *data = _data;
         struct parse_events_error *error = data->error;
         struct list_head *list;
  
         ALLOC_LIST(list);
-       ABORT_ON(parse_events_load_bpf(data, list, $1, false));
+       ABORT_ON(parse_events_load_bpf(data, list, $1, false, $2));
+       parse_events_terms__delete($2);
         $$ = list;
  }
  |
-PE_BPF_SOURCE
+PE_BPF_SOURCE opt_event_config
  {
         struct parse_events_evlist *data = _data;
         struct list_head *list;
  
         ALLOC_LIST(list);
-       ABORT_ON(parse_events_load_bpf(data, list, $1, true));
+       ABORT_ON(parse_events_load_bpf(data, list, $1, true, $2));
+       parse_events_terms__delete($2);
         $$ = list;
  }
  
+opt_event_config:
+'/' event_config '/'
+{
+       $$ = $2;
+}
+|
+'/' '/'
+{
+       $$ = NULL;
+}
+|
+{
+       $$ = NULL;
+}
+
  start_terms: event_config
  {
         struct parse_events_terms *data = _data;
@@ -573,6 +577,86 @@ PE_TERM
         ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1, &@1, NULL));
         $$ = term;
  }
+|
+PE_NAME array '=' PE_NAME
+{
+       struct parse_events_term *term;
+       int i;
+
+       ABORT_ON(parse_events_term__str(&term, PARSE_EVENTS__TERM_TYPE_USER,
+                                       $1, $4, &@1, &@4));
+
+       term->array = $2;
+       $$ = term;
+}
+|
+PE_NAME array '=' PE_VALUE
+{
+       struct parse_events_term *term;
+
+       ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
+                                       $1, $4, &@1, &@4));
+       term->array = $2;
+       $$ = term;
+}
+
+array:
+'[' array_terms ']'
+{
+       $$ = $2;
+}
+|
+PE_ARRAY_ALL
+{
+       $$.nr_ranges = 0;
+       $$.ranges = NULL;
+}
+
+array_terms:
+array_terms ',' array_term
+{
+       struct parse_events_array new_array;
+
+       new_array.nr_ranges = $1.nr_ranges + $3.nr_ranges;
+       new_array.ranges = malloc(sizeof(new_array.ranges[0]) *
+                                 new_array.nr_ranges);
+       ABORT_ON(!new_array.ranges);
+       memcpy(&new_array.ranges[0], $1.ranges,
+              $1.nr_ranges * sizeof(new_array.ranges[0]));
+       memcpy(&new_array.ranges[$1.nr_ranges], $3.ranges,
+              $3.nr_ranges * sizeof(new_array.ranges[0]));
+       free($1.ranges);
+       free($3.ranges);
+       $$ = new_array;
+}
+|
+array_term
+
+array_term:
+PE_VALUE
+{
+       struct parse_events_array array;
+
+       array.nr_ranges = 1;
+       array.ranges = malloc(sizeof(array.ranges[0]));
+       ABORT_ON(!array.ranges);
+       array.ranges[0].start = $1;
+       array.ranges[0].length = 1;
+       $$ = array;
+}
+|
+PE_VALUE PE_ARRAY_RANGE PE_VALUE
+{
+       struct parse_events_array array;
+
+       ABORT_ON($3 < $1);
+       array.nr_ranges = 1;
+       array.ranges = malloc(sizeof(array.ranges[0]));
+       ABORT_ON(!array.ranges);
+       array.ranges[0].start = $1;
+       array.ranges[0].length = $3 - $1 + 1;
+       $$ = array;
+}
  
  sep_dc: ':' |
  
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c

index b597bcc8fc781f4fa2c631044bddaab447b756e9..adef23b1352e836fec9f0f9a5290c578bb25cc43 100644 (file)
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -98,7 +98,7 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char *
         char scale[128];
         int fd, ret = -1;
         char path[PATH_MAX];
-       const char *lc;
+       char *lc;
  
         snprintf(path, PATH_MAX, "%s/%s.scale", dir, name);
  
@@ -123,6 +123,17 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char *
          */
         lc = setlocale(LC_NUMERIC, NULL);
  
+       /*
+        * The lc string may be allocated in static storage,
+        * so get a dynamic copy to make it survive setlocale
+        * call below.
+        */
+       lc = strdup(lc);
+       if (!lc) {
+               ret = -ENOMEM;
+               goto error;
+       }
+
         /*
          * force to C locale to ensure kernel
          * scale string is converted correctly.
@@ -135,6 +146,8 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char *
         /* restore locale */
         setlocale(LC_NUMERIC, lc);
  
+       free(lc);
+
         ret = 0;
  error:
         close(fd);
@@ -153,7 +166,7 @@ static int perf_pmu__parse_unit(struct perf_pmu_alias *alias, char *dir, char *n
         if (fd == -1)
                 return -1;
  
-               sret = read(fd, alias->unit, UNIT_MAX_LEN);
+       sret = read(fd, alias->unit, UNIT_MAX_LEN);
         if (sret < 0)
                 goto error;
  
@@ -284,13 +297,12 @@ static int pmu_aliases_parse(char *dir, struct list_head *head)
  {
         struct dirent *evt_ent;
         DIR *event_dir;
-       int ret = 0;
  
         event_dir = opendir(dir);
         if (!event_dir)
                 return -EINVAL;
  
-       while (!ret && (evt_ent = readdir(event_dir))) {
+       while ((evt_ent = readdir(event_dir))) {
                 char path[PATH_MAX];
                 char *name = evt_ent->d_name;
                 FILE *file;
@@ -306,17 +318,19 @@ static int pmu_aliases_parse(char *dir, struct list_head *head)
  
                 snprintf(path, PATH_MAX, "%s/%s", dir, name);
  
-               ret = -EINVAL;
                 file = fopen(path, "r");
-               if (!file)
-                       break;
+               if (!file) {
+                       pr_debug("Cannot open %s\n", path);
+                       continue;
+               }
  
-               ret = perf_pmu__new_alias(head, dir, name, file);
+               if (perf_pmu__new_alias(head, dir, name, file) < 0)
+                       pr_debug("Cannot set up %s\n", name);
                 fclose(file);
         }
  
         closedir(event_dir);
-       return ret;
+       return 0;
  }
  
  /*
@@ -354,7 +368,7 @@ static int pmu_alias_terms(struct perf_pmu_alias *alias,
         list_for_each_entry(term, &alias->terms, list) {
                 ret = parse_events_term__clone(&cloned, term);
                 if (ret) {
-                       parse_events__free_terms(&list);
+                       parse_events_terms__purge(&list);
                         return ret;
                 }
                 list_add_tail(&cloned->list, &list);
diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c

index 544509c159cec4111e5865007b513e1197c576c4..b3aabc0d4eb0096fff41fb078a09d77988f103ff 100644 (file)
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -187,6 +187,9 @@ static void define_event_symbols(struct event_format *event,
                                  const char *ev_name,
                                  struct print_arg *args)
  {
+       if (args == NULL)
+               return;
+
         switch (args->type) {
         case PRINT_NULL:
                 break;
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c

index d72fafc1c800db1a699ac37e4ecb4acd0f1e3326..fbd05242b4e59786ca0e081a52729248d780f5a0 100644 (file)
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -205,6 +205,9 @@ static void define_event_symbols(struct event_format *event,
                                  const char *ev_name,
                                  struct print_arg *args)
  {
+       if (args == NULL)
+               return;
+
         switch (args->type) {
         case PRINT_NULL:
                 break;
@@ -1091,8 +1094,6 @@ static int python_start_script(const char *script, int argc, const char **argv)
                 goto error;
         }
  
-       free(command_line);
-
         set_table_handlers(tables);
  
         if (tables->db_export_mode) {
@@ -1101,6 +1102,8 @@ static int python_start_script(const char *script, int argc, const char **argv)
                         goto error;
         }
  
+       free(command_line);
+
         return err;
  error:
         Py_Finalize();
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c

index 40b7a0d0905b8d7f01972c6e38e225f108b15133..60b3593d210dbc3b52997a693bcc8f9645687f08 100644 (file)
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -240,14 +240,6 @@ static int process_event_stub(struct perf_tool *tool __maybe_unused,
         return 0;
  }
  
-static int process_build_id_stub(struct perf_tool *tool __maybe_unused,
-                                union perf_event *event __maybe_unused,
-                                struct perf_session *session __maybe_unused)
-{
-       dump_printf(": unhandled!\n");
-       return 0;
-}
-
  static int process_finished_round_stub(struct perf_tool *tool __maybe_unused,
                                        union perf_event *event __maybe_unused,
                                        struct ordered_events *oe __maybe_unused)
@@ -260,23 +252,6 @@ static int process_finished_round(struct perf_tool *tool,
                                   union perf_event *event,
                                   struct ordered_events *oe);
  
-static int process_id_index_stub(struct perf_tool *tool __maybe_unused,
-                                union perf_event *event __maybe_unused,
-                                struct perf_session *perf_session
-                                __maybe_unused)
-{
-       dump_printf(": unhandled!\n");
-       return 0;
-}
-
-static int process_event_auxtrace_info_stub(struct perf_tool *tool __maybe_unused,
-                               union perf_event *event __maybe_unused,
-                               struct perf_session *session __maybe_unused)
-{
-       dump_printf(": unhandled!\n");
-       return 0;
-}
-
  static int skipn(int fd, off_t n)
  {
         char buf[4096];
@@ -303,10 +278,9 @@ static s64 process_event_auxtrace_stub(struct perf_tool *tool __maybe_unused,
         return event->auxtrace.size;
  }
  
-static
-int process_event_auxtrace_error_stub(struct perf_tool *tool __maybe_unused,
-                                     union perf_event *event __maybe_unused,
-                                     struct perf_session *session __maybe_unused)
+static int process_event_op2_stub(struct perf_tool *tool __maybe_unused,
+                                 union perf_event *event __maybe_unused,
+                                 struct perf_session *session __maybe_unused)
  {
         dump_printf(": unhandled!\n");
         return 0;
@@ -410,7 +384,7 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
         if (tool->tracing_data == NULL)
                 tool->tracing_data = process_event_synth_tracing_data_stub;
         if (tool->build_id == NULL)
-               tool->build_id = process_build_id_stub;
+               tool->build_id = process_event_op2_stub;
         if (tool->finished_round == NULL) {
                 if (tool->ordered_events)
                         tool->finished_round = process_finished_round;
@@ -418,13 +392,13 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
                         tool->finished_round = process_finished_round_stub;
         }
         if (tool->id_index == NULL)
-               tool->id_index = process_id_index_stub;
+               tool->id_index = process_event_op2_stub;
         if (tool->auxtrace_info == NULL)
-               tool->auxtrace_info = process_event_auxtrace_info_stub;
+               tool->auxtrace_info = process_event_op2_stub;
         if (tool->auxtrace == NULL)
                 tool->auxtrace = process_event_auxtrace_stub;
         if (tool->auxtrace_error == NULL)
-               tool->auxtrace_error = process_event_auxtrace_error_stub;
+               tool->auxtrace_error = process_event_op2_stub;
         if (tool->thread_map == NULL)
                 tool->thread_map = process_event_thread_map_stub;
         if (tool->cpu_map == NULL)
diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py

index 1833103768cb99fbdbb92ea8910b08488fd2aac3..c8680984d2d6680f56a974a1b54056a5e74263e1 100644 (file)
--- a/tools/perf/util/setup.py
+++ b/tools/perf/util/setup.py
@@ -22,6 +22,7 @@ cflags = getenv('CFLAGS', '').split()
  # switch off several checks (need to be at the end of cflags list)
  cflags += ['-fno-strict-aliasing', '-Wno-write-strings', '-Wno-unused-parameter' ]
  
+src_perf  = getenv('srctree') + '/tools/perf'
  build_lib = getenv('PYTHON_EXTBUILD_LIB')
  build_tmp = getenv('PYTHON_EXTBUILD_TMP')
  libtraceevent = getenv('LIBTRACEEVENT')
@@ -30,6 +31,9 @@ libapikfs = getenv('LIBAPI')
  ext_sources = [f.strip() for f in file('util/python-ext-sources')
                                 if len(f.strip()) > 0 and f[0] != '#']
  
+# use full paths with source files
+ext_sources = map(lambda x: '%s/%s' % (src_perf, x) , ext_sources)
+
  perf = Extension('perf',
                   sources = ext_sources,
                   include_dirs = ['util/include'],
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c

index ec722346e6ffb8dd6531e94576bb15b548a1487b..93fa136b0025c02e1e4383be523fcf37c092f36e 100644 (file)
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -6,6 +6,7 @@
  #include "evsel.h"
  #include "evlist.h"
  #include <traceevent/event-parse.h>
+#include "mem-events.h"
  
  regex_t                parent_regex;
  const char     default_parent_pattern[] = "^sys_|^do_page_fault";
@@ -25,9 +26,19 @@ int          sort__has_parent = 0;
  int            sort__has_sym = 0;
  int            sort__has_dso = 0;
  int            sort__has_socket = 0;
+int            sort__has_thread = 0;
+int            sort__has_comm = 0;
  enum sort_mode sort__mode = SORT_MODE__NORMAL;
  
-
+/*
+ * Replaces all occurrences of a char used with the:
+ *
+ * -t, --field-separator
+ *
+ * option, that uses a special separator character and don't pad with spaces,
+ * replacing all occurances of this separator in symbol names (and other
+ * output) with a '.' character, that thus it's the only non valid separator.
+*/
  static int repsep_snprintf(char *bf, size_t size, const char *fmt, ...)
  {
         int n;
@@ -80,10 +91,21 @@ static int hist_entry__thread_snprintf(struct hist_entry *he, char *bf,
                                width, width, comm ?: "");
  }
  
+static int hist_entry__thread_filter(struct hist_entry *he, int type, const void *arg)
+{
+       const struct thread *th = arg;
+
+       if (type != HIST_FILTER__THREAD)
+               return -1;
+
+       return th && he->thread != th;
+}
+
  struct sort_entry sort_thread = {
         .se_header      = "  Pid:Command",
         .se_cmp         = sort__thread_cmp,
         .se_snprintf    = hist_entry__thread_snprintf,
+       .se_filter      = hist_entry__thread_filter,
         .se_width_idx   = HISTC_THREAD,
  };
  
@@ -121,6 +143,7 @@ struct sort_entry sort_comm = {
         .se_collapse    = sort__comm_collapse,
         .se_sort        = sort__comm_sort,
         .se_snprintf    = hist_entry__comm_snprintf,
+       .se_filter      = hist_entry__thread_filter,
         .se_width_idx   = HISTC_COMM,
  };
  
@@ -170,10 +193,21 @@ static int hist_entry__dso_snprintf(struct hist_entry *he, char *bf,
         return _hist_entry__dso_snprintf(he->ms.map, bf, size, width);
  }
  
+static int hist_entry__dso_filter(struct hist_entry *he, int type, const void *arg)
+{
+       const struct dso *dso = arg;
+
+       if (type != HIST_FILTER__DSO)
+               return -1;
+
+       return dso && (!he->ms.map || he->ms.map->dso != dso);
+}
+
  struct sort_entry sort_dso = {
         .se_header      = "Shared Object",
         .se_cmp         = sort__dso_cmp,
         .se_snprintf    = hist_entry__dso_snprintf,
+       .se_filter      = hist_entry__dso_filter,
         .se_width_idx   = HISTC_DSO,
  };
  
@@ -246,10 +280,8 @@ static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym,
                         ret += repsep_snprintf(bf + ret, size - ret, "%s", sym->name);
                         ret += repsep_snprintf(bf + ret, size - ret, "+0x%llx",
                                         ip - map->unmap_ip(map, sym->start));
-                       ret += repsep_snprintf(bf + ret, size - ret, "%-*s",
-                                      width - ret, "");
                 } else {
-                       ret += repsep_snprintf(bf + ret, size - ret, "%-*s",
+                       ret += repsep_snprintf(bf + ret, size - ret, "%.*s",
                                                width - ret,
                                                sym->name);
                 }
@@ -257,14 +289,9 @@ static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym,
                 size_t len = BITS_PER_LONG / 4;
                 ret += repsep_snprintf(bf + ret, size - ret, "%-#.*llx",
                                        len, ip);
-               ret += repsep_snprintf(bf + ret, size - ret, "%-*s",
-                                      width - ret, "");
         }
  
-       if (ret > width)
-               bf[width] = '\0';
-
-       return width;
+       return ret;
  }
  
  static int hist_entry__sym_snprintf(struct hist_entry *he, char *bf,
@@ -274,46 +301,56 @@ static int hist_entry__sym_snprintf(struct hist_entry *he, char *bf,
                                          he->level, bf, size, width);
  }
  
+static int hist_entry__sym_filter(struct hist_entry *he, int type, const void *arg)
+{
+       const char *sym = arg;
+
+       if (type != HIST_FILTER__SYMBOL)
+               return -1;
+
+       return sym && (!he->ms.sym || !strstr(he->ms.sym->name, sym));
+}
+
  struct sort_entry sort_sym = {
         .se_header      = "Symbol",
         .se_cmp         = sort__sym_cmp,
         .se_sort        = sort__sym_sort,
         .se_snprintf    = hist_entry__sym_snprintf,
+       .se_filter      = hist_entry__sym_filter,
         .se_width_idx   = HISTC_SYMBOL,
  };
  
  /* --sort srcline */
  
+static char *hist_entry__get_srcline(struct hist_entry *he)
+{
+       struct map *map = he->ms.map;
+
+       if (!map)
+               return SRCLINE_UNKNOWN;
+
+       return get_srcline(map->dso, map__rip_2objdump(map, he->ip),
+                          he->ms.sym, true);
+}
+
  static int64_t
  sort__srcline_cmp(struct hist_entry *left, struct hist_entry *right)
  {
-       if (!left->srcline) {
-               if (!left->ms.map)
-                       left->srcline = SRCLINE_UNKNOWN;
-               else {
-                       struct map *map = left->ms.map;
-                       left->srcline = get_srcline(map->dso,
-                                          map__rip_2objdump(map, left->ip),
-                                                   left->ms.sym, true);
-               }
-       }
-       if (!right->srcline) {
-               if (!right->ms.map)
-                       right->srcline = SRCLINE_UNKNOWN;
-               else {
-                       struct map *map = right->ms.map;
-                       right->srcline = get_srcline(map->dso,
-                                            map__rip_2objdump(map, right->ip),
-                                                    right->ms.sym, true);
-               }
-       }
+       if (!left->srcline)
+               left->srcline = hist_entry__get_srcline(left);
+       if (!right->srcline)
+               right->srcline = hist_entry__get_srcline(right);
+
         return strcmp(right->srcline, left->srcline);
  }
  
  static int hist_entry__srcline_snprintf(struct hist_entry *he, char *bf,
                                         size_t size, unsigned int width)
  {
-       return repsep_snprintf(bf, size, "%-*.*s", width, width, he->srcline);
+       if (!he->srcline)
+               he->srcline = hist_entry__get_srcline(he);
+
+       return repsep_snprintf(bf, size, "%-.*s", width, he->srcline);
  }
  
  struct sort_entry sort_srcline = {
@@ -327,11 +364,14 @@ struct sort_entry sort_srcline = {
  
  static char no_srcfile[1];
  
-static char *get_srcfile(struct hist_entry *e)
+static char *hist_entry__get_srcfile(struct hist_entry *e)
  {
         char *sf, *p;
         struct map *map = e->ms.map;
  
+       if (!map)
+               return no_srcfile;
+
         sf = __get_srcline(map->dso, map__rip_2objdump(map, e->ip),
                          e->ms.sym, false, true);
         if (!strcmp(sf, SRCLINE_UNKNOWN))
@@ -348,25 +388,21 @@ static char *get_srcfile(struct hist_entry *e)
  static int64_t
  sort__srcfile_cmp(struct hist_entry *left, struct hist_entry *right)
  {
-       if (!left->srcfile) {
-               if (!left->ms.map)
-                       left->srcfile = no_srcfile;
-               else
-                       left->srcfile = get_srcfile(left);
-       }
-       if (!right->srcfile) {
-               if (!right->ms.map)
-                       right->srcfile = no_srcfile;
-               else
-                       right->srcfile = get_srcfile(right);
-       }
+       if (!left->srcfile)
+               left->srcfile = hist_entry__get_srcfile(left);
+       if (!right->srcfile)
+               right->srcfile = hist_entry__get_srcfile(right);
+
         return strcmp(right->srcfile, left->srcfile);
  }
  
  static int hist_entry__srcfile_snprintf(struct hist_entry *he, char *bf,
                                         size_t size, unsigned int width)
  {
-       return repsep_snprintf(bf, size, "%-*.*s", width, width, he->srcfile);
+       if (!he->srcfile)
+               he->srcfile = hist_entry__get_srcfile(he);
+
+       return repsep_snprintf(bf, size, "%-.*s", width, he->srcfile);
  }
  
  struct sort_entry sort_srcfile = {
@@ -439,10 +475,21 @@ static int hist_entry__socket_snprintf(struct hist_entry *he, char *bf,
         return repsep_snprintf(bf, size, "%*.*d", width, width-3, he->socket);
  }
  
+static int hist_entry__socket_filter(struct hist_entry *he, int type, const void *arg)
+{
+       int sk = *(const int *)arg;
+
+       if (type != HIST_FILTER__SOCKET)
+               return -1;
+
+       return sk >= 0 && he->socket != sk;
+}
+
  struct sort_entry sort_socket = {
         .se_header      = "Socket",
         .se_cmp         = sort__socket_cmp,
         .se_snprintf    = hist_entry__socket_snprintf,
+       .se_filter      = hist_entry__socket_filter,
         .se_width_idx   = HISTC_SOCKET,
  };
  
@@ -483,9 +530,6 @@ sort__trace_cmp(struct hist_entry *left, struct hist_entry *right)
         if (right->trace_output == NULL)
                 right->trace_output = get_trace_output(right);
  
-       hists__new_col_len(left->hists, HISTC_TRACE, strlen(left->trace_output));
-       hists__new_col_len(right->hists, HISTC_TRACE, strlen(right->trace_output));
-
         return strcmp(right->trace_output, left->trace_output);
  }
  
@@ -496,11 +540,11 @@ static int hist_entry__trace_snprintf(struct hist_entry *he, char *bf,
  
         evsel = hists_to_evsel(he->hists);
         if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
-               return scnprintf(bf, size, "%-*.*s", width, width, "N/A");
+               return scnprintf(bf, size, "%-.*s", width, "N/A");
  
         if (he->trace_output == NULL)
                 he->trace_output = get_trace_output(he);
-       return repsep_snprintf(bf, size, "%-*.*s", width, width, he->trace_output);
+       return repsep_snprintf(bf, size, "%-.*s", width, he->trace_output);
  }
  
  struct sort_entry sort_trace = {
@@ -532,6 +576,18 @@ static int hist_entry__dso_from_snprintf(struct hist_entry *he, char *bf,
                 return repsep_snprintf(bf, size, "%-*.*s", width, width, "N/A");
  }
  
+static int hist_entry__dso_from_filter(struct hist_entry *he, int type,
+                                      const void *arg)
+{
+       const struct dso *dso = arg;
+
+       if (type != HIST_FILTER__DSO)
+               return -1;
+
+       return dso && (!he->branch_info || !he->branch_info->from.map ||
+                      he->branch_info->from.map->dso != dso);
+}
+
  static int64_t
  sort__dso_to_cmp(struct hist_entry *left, struct hist_entry *right)
  {
@@ -552,6 +608,18 @@ static int hist_entry__dso_to_snprintf(struct hist_entry *he, char *bf,
                 return repsep_snprintf(bf, size, "%-*.*s", width, width, "N/A");
  }
  
+static int hist_entry__dso_to_filter(struct hist_entry *he, int type,
+                                    const void *arg)
+{
+       const struct dso *dso = arg;
+
+       if (type != HIST_FILTER__DSO)
+               return -1;
+
+       return dso && (!he->branch_info || !he->branch_info->to.map ||
+                      he->branch_info->to.map->dso != dso);
+}
+
  static int64_t
  sort__sym_from_cmp(struct hist_entry *left, struct hist_entry *right)
  {
@@ -613,10 +681,35 @@ static int hist_entry__sym_to_snprintf(struct hist_entry *he, char *bf,
         return repsep_snprintf(bf, size, "%-*.*s", width, width, "N/A");
  }
  
+static int hist_entry__sym_from_filter(struct hist_entry *he, int type,
+                                      const void *arg)
+{
+       const char *sym = arg;
+
+       if (type != HIST_FILTER__SYMBOL)
+               return -1;
+
+       return sym && !(he->branch_info && he->branch_info->from.sym &&
+                       strstr(he->branch_info->from.sym->name, sym));
+}
+
+static int hist_entry__sym_to_filter(struct hist_entry *he, int type,
+                                      const void *arg)
+{
+       const char *sym = arg;
+
+       if (type != HIST_FILTER__SYMBOL)
+               return -1;
+
+       return sym && !(he->branch_info && he->branch_info->to.sym &&
+                       strstr(he->branch_info->to.sym->name, sym));
+}
+
  struct sort_entry sort_dso_from = {
         .se_header      = "Source Shared Object",
         .se_cmp         = sort__dso_from_cmp,
         .se_snprintf    = hist_entry__dso_from_snprintf,
+       .se_filter      = hist_entry__dso_from_filter,
         .se_width_idx   = HISTC_DSO_FROM,
  };
  
@@ -624,6 +717,7 @@ struct sort_entry sort_dso_to = {
         .se_header      = "Target Shared Object",
         .se_cmp         = sort__dso_to_cmp,
         .se_snprintf    = hist_entry__dso_to_snprintf,
+       .se_filter      = hist_entry__dso_to_filter,
         .se_width_idx   = HISTC_DSO_TO,
  };
  
@@ -631,6 +725,7 @@ struct sort_entry sort_sym_from = {
         .se_header      = "Source Symbol",
         .se_cmp         = sort__sym_from_cmp,
         .se_snprintf    = hist_entry__sym_from_snprintf,
+       .se_filter      = hist_entry__sym_from_filter,
         .se_width_idx   = HISTC_SYMBOL_FROM,
  };
  
@@ -638,6 +733,7 @@ struct sort_entry sort_sym_to = {
         .se_header      = "Target Symbol",
         .se_cmp         = sort__sym_to_cmp,
         .se_snprintf    = hist_entry__sym_to_snprintf,
+       .se_filter      = hist_entry__sym_to_filter,
         .se_width_idx   = HISTC_SYMBOL_TO,
  };
  
@@ -797,20 +893,10 @@ sort__locked_cmp(struct hist_entry *left, struct hist_entry *right)
  static int hist_entry__locked_snprintf(struct hist_entry *he, char *bf,
                                     size_t size, unsigned int width)
  {
-       const char *out;
-       u64 mask = PERF_MEM_LOCK_NA;
+       char out[10];
  
-       if (he->mem_info)
-               mask = he->mem_info->data_src.mem_lock;
-
-       if (mask & PERF_MEM_LOCK_NA)
-               out = "N/A";
-       else if (mask & PERF_MEM_LOCK_LOCKED)
-               out = "Yes";
-       else
-               out = "No";
-
-       return repsep_snprintf(bf, size, "%-*s", width, out);
+       perf_mem__lck_scnprintf(out, sizeof(out), he->mem_info);
+       return repsep_snprintf(bf, size, "%.*s", width, out);
  }
  
  static int64_t
@@ -832,54 +918,12 @@ sort__tlb_cmp(struct hist_entry *left, struct hist_entry *right)
         return (int64_t)(data_src_r.mem_dtlb - data_src_l.mem_dtlb);
  }
  
-static const char * const tlb_access[] = {
-       "N/A",
-       "HIT",
-       "MISS",
-       "L1",
-       "L2",
-       "Walker",
-       "Fault",
-};
-#define NUM_TLB_ACCESS (sizeof(tlb_access)/sizeof(const char *))
-
  static int hist_entry__tlb_snprintf(struct hist_entry *he, char *bf,
                                     size_t size, unsigned int width)
  {
         char out[64];
-       size_t sz = sizeof(out) - 1; /* -1 for null termination */
-       size_t l = 0, i;
-       u64 m = PERF_MEM_TLB_NA;
-       u64 hit, miss;
-
-       out[0] = '\0';
-
-       if (he->mem_info)
-               m = he->mem_info->data_src.mem_dtlb;
-
-       hit = m & PERF_MEM_TLB_HIT;
-       miss = m & PERF_MEM_TLB_MISS;
-
-       /* already taken care of */
-       m &= ~(PERF_MEM_TLB_HIT|PERF_MEM_TLB_MISS);
-
-       for (i = 0; m && i < NUM_TLB_ACCESS; i++, m >>= 1) {
-               if (!(m & 0x1))
-                       continue;
-               if (l) {
-                       strcat(out, " or ");
-                       l += 4;
-               }
-               strncat(out, tlb_access[i], sz - l);
-               l += strlen(tlb_access[i]);
-       }
-       if (*out == '\0')
-               strcpy(out, "N/A");
-       if (hit)
-               strncat(out, " hit", sz - l);
-       if (miss)
-               strncat(out, " miss", sz - l);
  
+       perf_mem__tlb_scnprintf(out, sizeof(out), he->mem_info);
         return repsep_snprintf(bf, size, "%-*s", width, out);
  }
  
@@ -902,61 +946,12 @@ sort__lvl_cmp(struct hist_entry *left, struct hist_entry *right)
         return (int64_t)(data_src_r.mem_lvl - data_src_l.mem_lvl);
  }
  
-static const char * const mem_lvl[] = {
-       "N/A",
-       "HIT",
-       "MISS",
-       "L1",
-       "LFB",
-       "L2",
-       "L3",
-       "Local RAM",
-       "Remote RAM (1 hop)",
-       "Remote RAM (2 hops)",
-       "Remote Cache (1 hop)",
-       "Remote Cache (2 hops)",
-       "I/O",
-       "Uncached",
-};
-#define NUM_MEM_LVL (sizeof(mem_lvl)/sizeof(const char *))
-
  static int hist_entry__lvl_snprintf(struct hist_entry *he, char *bf,
                                     size_t size, unsigned int width)
  {
         char out[64];
-       size_t sz = sizeof(out) - 1; /* -1 for null termination */
-       size_t i, l = 0;
-       u64 m =  PERF_MEM_LVL_NA;
-       u64 hit, miss;
-
-       if (he->mem_info)
-               m  = he->mem_info->data_src.mem_lvl;
-
-       out[0] = '\0';
-
-       hit = m & PERF_MEM_LVL_HIT;
-       miss = m & PERF_MEM_LVL_MISS;
-
-       /* already taken care of */
-       m &= ~(PERF_MEM_LVL_HIT|PERF_MEM_LVL_MISS);
-
-       for (i = 0; m && i < NUM_MEM_LVL; i++, m >>= 1) {
-               if (!(m & 0x1))
-                       continue;
-               if (l) {
-                       strcat(out, " or ");
-                       l += 4;
-               }
-               strncat(out, mem_lvl[i], sz - l);
-               l += strlen(mem_lvl[i]);
-       }
-       if (*out == '\0')
-               strcpy(out, "N/A");
-       if (hit)
-               strncat(out, " hit", sz - l);
-       if (miss)
-               strncat(out, " miss", sz - l);
  
+       perf_mem__lvl_scnprintf(out, sizeof(out), he->mem_info);
         return repsep_snprintf(bf, size, "%-*s", width, out);
  }
  
@@ -979,51 +974,15 @@ sort__snoop_cmp(struct hist_entry *left, struct hist_entry *right)
         return (int64_t)(data_src_r.mem_snoop - data_src_l.mem_snoop);
  }
  
-static const char * const snoop_access[] = {
-       "N/A",
-       "None",
-       "Miss",
-       "Hit",
-       "HitM",
-};
-#define NUM_SNOOP_ACCESS (sizeof(snoop_access)/sizeof(const char *))
-
  static int hist_entry__snoop_snprintf(struct hist_entry *he, char *bf,
                                     size_t size, unsigned int width)
  {
         char out[64];
-       size_t sz = sizeof(out) - 1; /* -1 for null termination */
-       size_t i, l = 0;
-       u64 m = PERF_MEM_SNOOP_NA;
-
-       out[0] = '\0';
-
-       if (he->mem_info)
-               m = he->mem_info->data_src.mem_snoop;
-
-       for (i = 0; m && i < NUM_SNOOP_ACCESS; i++, m >>= 1) {
-               if (!(m & 0x1))
-                       continue;
-               if (l) {
-                       strcat(out, " or ");
-                       l += 4;
-               }
-               strncat(out, snoop_access[i], sz - l);
-               l += strlen(snoop_access[i]);
-       }
-
-       if (*out == '\0')
-               strcpy(out, "N/A");
  
+       perf_mem__snp_scnprintf(out, sizeof(out), he->mem_info);
         return repsep_snprintf(bf, size, "%-*s", width, out);
  }
  
-static inline  u64 cl_address(u64 address)
-{
-       /* return the cacheline of the address */
-       return (address & ~(cacheline_size - 1));
-}
-
  static int64_t
  sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right)
  {
@@ -1440,20 +1399,6 @@ struct hpp_sort_entry {
         struct sort_entry *se;
  };
  
-bool perf_hpp__same_sort_entry(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
-{
-       struct hpp_sort_entry *hse_a;
-       struct hpp_sort_entry *hse_b;
-
-       if (!perf_hpp__is_sort_entry(a) || !perf_hpp__is_sort_entry(b))
-               return false;
-
-       hse_a = container_of(a, struct hpp_sort_entry, hpp);
-       hse_b = container_of(b, struct hpp_sort_entry, hpp);
-
-       return hse_a->se == hse_b->se;
-}
-
  void perf_hpp__reset_sort_width(struct perf_hpp_fmt *fmt, struct hists *hists)
  {
         struct hpp_sort_entry *hse;
@@ -1539,8 +1484,56 @@ static int64_t __sort__hpp_sort(struct perf_hpp_fmt *fmt,
         return sort_fn(a, b);
  }
  
+bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format)
+{
+       return format->header == __sort__hpp_header;
+}
+
+#define MK_SORT_ENTRY_CHK(key)                                 \
+bool perf_hpp__is_ ## key ## _entry(struct perf_hpp_fmt *fmt)  \
+{                                                              \
+       struct hpp_sort_entry *hse;                             \
+                                                               \
+       if (!perf_hpp__is_sort_entry(fmt))                      \
+               return false;                                   \
+                                                               \
+       hse = container_of(fmt, struct hpp_sort_entry, hpp);    \
+       return hse->se == &sort_ ## key ;                       \
+}
+
+MK_SORT_ENTRY_CHK(trace)
+MK_SORT_ENTRY_CHK(srcline)
+MK_SORT_ENTRY_CHK(srcfile)
+MK_SORT_ENTRY_CHK(thread)
+MK_SORT_ENTRY_CHK(comm)
+MK_SORT_ENTRY_CHK(dso)
+MK_SORT_ENTRY_CHK(sym)
+
+
+static bool __sort__hpp_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
+{
+       struct hpp_sort_entry *hse_a;
+       struct hpp_sort_entry *hse_b;
+
+       if (!perf_hpp__is_sort_entry(a) || !perf_hpp__is_sort_entry(b))
+               return false;
+
+       hse_a = container_of(a, struct hpp_sort_entry, hpp);
+       hse_b = container_of(b, struct hpp_sort_entry, hpp);
+
+       return hse_a->se == hse_b->se;
+}
+
+static void hse_free(struct perf_hpp_fmt *fmt)
+{
+       struct hpp_sort_entry *hse;
+
+       hse = container_of(fmt, struct hpp_sort_entry, hpp);
+       free(hse);
+}
+
  static struct hpp_sort_entry *
-__sort_dimension__alloc_hpp(struct sort_dimension *sd)
+__sort_dimension__alloc_hpp(struct sort_dimension *sd, int level)
  {
         struct hpp_sort_entry *hse;
  
@@ -1560,40 +1553,92 @@ __sort_dimension__alloc_hpp(struct sort_dimension *sd)
         hse->hpp.cmp = __sort__hpp_cmp;
         hse->hpp.collapse = __sort__hpp_collapse;
         hse->hpp.sort = __sort__hpp_sort;
+       hse->hpp.equal = __sort__hpp_equal;
+       hse->hpp.free = hse_free;
  
         INIT_LIST_HEAD(&hse->hpp.list);
         INIT_LIST_HEAD(&hse->hpp.sort_list);
         hse->hpp.elide = false;
         hse->hpp.len = 0;
         hse->hpp.user_len = 0;
+       hse->hpp.level = level;
  
         return hse;
  }
  
-bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format)
+static void hpp_free(struct perf_hpp_fmt *fmt)
  {
-       return format->header == __sort__hpp_header;
+       free(fmt);
+}
+
+static struct perf_hpp_fmt *__hpp_dimension__alloc_hpp(struct hpp_dimension *hd,
+                                                      int level)
+{
+       struct perf_hpp_fmt *fmt;
+
+       fmt = memdup(hd->fmt, sizeof(*fmt));
+       if (fmt) {
+               INIT_LIST_HEAD(&fmt->list);
+               INIT_LIST_HEAD(&fmt->sort_list);
+               fmt->free = hpp_free;
+               fmt->level = level;
+       }
+
+       return fmt;
+}
+
+int hist_entry__filter(struct hist_entry *he, int type, const void *arg)
+{
+       struct perf_hpp_fmt *fmt;
+       struct hpp_sort_entry *hse;
+       int ret = -1;
+       int r;
+
+       perf_hpp_list__for_each_format(he->hpp_list, fmt) {
+               if (!perf_hpp__is_sort_entry(fmt))
+                       continue;
+
+               hse = container_of(fmt, struct hpp_sort_entry, hpp);
+               if (hse->se->se_filter == NULL)
+                       continue;
+
+               /*
+                * hist entry is filtered if any of sort key in the hpp list
+                * is applied.  But it should skip non-matched filter types.
+                */
+               r = hse->se->se_filter(he, type, arg);
+               if (r >= 0) {
+                       if (ret < 0)
+                               ret = 0;
+                       ret |= r;
+               }
+       }
+
+       return ret;
  }
  
-static int __sort_dimension__add_hpp_sort(struct sort_dimension *sd)
+static int __sort_dimension__add_hpp_sort(struct sort_dimension *sd,
+                                         struct perf_hpp_list *list,
+                                         int level)
  {
-       struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd);
+       struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd, level);
  
         if (hse == NULL)
                 return -1;
  
-       perf_hpp__register_sort_field(&hse->hpp);
+       perf_hpp_list__register_sort_field(list, &hse->hpp);
         return 0;
  }
  
-static int __sort_dimension__add_hpp_output(struct sort_dimension *sd)
+static int __sort_dimension__add_hpp_output(struct sort_dimension *sd,
+                                           struct perf_hpp_list *list)
  {
-       struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd);
+       struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd, 0);
  
         if (hse == NULL)
                 return -1;
  
-       perf_hpp__column_register(&hse->hpp);
+       perf_hpp_list__column_register(list, &hse->hpp);
         return 0;
  }
  
@@ -1727,6 +1772,9 @@ static int __sort__hde_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
         if (hde->raw_trace)
                 goto raw_field;
  
+       if (!he->trace_output)
+               he->trace_output = get_trace_output(he);
+
         field = hde->field;
         namelen = strlen(field->name);
         str = he->trace_output;
@@ -1776,6 +1824,11 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt,
  
         hde = container_of(fmt, struct hpp_dynamic_entry, hpp);
  
+       if (b == NULL) {
+               update_dynamic_len(hde, a);
+               return 0;
+       }
+
         field = hde->field;
         if (field->flags & FIELD_IS_DYNAMIC) {
                 unsigned long long dyn;
@@ -1790,9 +1843,6 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt,
         } else {
                 offset = field->offset;
                 size = field->size;
-
-               update_dynamic_len(hde, a);
-               update_dynamic_len(hde, b);
         }
  
         return memcmp(a->raw_data + offset, b->raw_data + offset, size);
@@ -1803,8 +1853,31 @@ bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt *fmt)
         return fmt->cmp == __sort__hde_cmp;
  }
  
+static bool __sort__hde_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
+{
+       struct hpp_dynamic_entry *hde_a;
+       struct hpp_dynamic_entry *hde_b;
+
+       if (!perf_hpp__is_dynamic_entry(a) || !perf_hpp__is_dynamic_entry(b))
+               return false;
+
+       hde_a = container_of(a, struct hpp_dynamic_entry, hpp);
+       hde_b = container_of(b, struct hpp_dynamic_entry, hpp);
+
+       return hde_a->field == hde_b->field;
+}
+
+static void hde_free(struct perf_hpp_fmt *fmt)
+{
+       struct hpp_dynamic_entry *hde;
+
+       hde = container_of(fmt, struct hpp_dynamic_entry, hpp);
+       free(hde);
+}
+
  static struct hpp_dynamic_entry *
-__alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field)
+__alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field,
+                     int level)
  {
         struct hpp_dynamic_entry *hde;
  
@@ -1827,16 +1900,47 @@ __alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field)
         hde->hpp.cmp = __sort__hde_cmp;
         hde->hpp.collapse = __sort__hde_cmp;
         hde->hpp.sort = __sort__hde_cmp;
+       hde->hpp.equal = __sort__hde_equal;
+       hde->hpp.free = hde_free;
  
         INIT_LIST_HEAD(&hde->hpp.list);
         INIT_LIST_HEAD(&hde->hpp.sort_list);
         hde->hpp.elide = false;
         hde->hpp.len = 0;
         hde->hpp.user_len = 0;
+       hde->hpp.level = level;
  
         return hde;
  }
  
+struct perf_hpp_fmt *perf_hpp_fmt__dup(struct perf_hpp_fmt *fmt)
+{
+       struct perf_hpp_fmt *new_fmt = NULL;
+
+       if (perf_hpp__is_sort_entry(fmt)) {
+               struct hpp_sort_entry *hse, *new_hse;
+
+               hse = container_of(fmt, struct hpp_sort_entry, hpp);
+               new_hse = memdup(hse, sizeof(*hse));
+               if (new_hse)
+                       new_fmt = &new_hse->hpp;
+       } else if (perf_hpp__is_dynamic_entry(fmt)) {
+               struct hpp_dynamic_entry *hde, *new_hde;
+
+               hde = container_of(fmt, struct hpp_dynamic_entry, hpp);
+               new_hde = memdup(hde, sizeof(*hde));
+               if (new_hde)
+                       new_fmt = &new_hde->hpp;
+       } else {
+               new_fmt = memdup(fmt, sizeof(*fmt));
+       }
+
+       INIT_LIST_HEAD(&new_fmt->list);
+       INIT_LIST_HEAD(&new_fmt->sort_list);
+
+       return new_fmt;
+}
+
  static int parse_field_name(char *str, char **event, char **field, char **opt)
  {
         char *event_name, *field_name, *opt_name;
@@ -1908,11 +2012,11 @@ static struct perf_evsel *find_evsel(struct perf_evlist *evlist, char *event_nam
  
  static int __dynamic_dimension__add(struct perf_evsel *evsel,
                                     struct format_field *field,
-                                   bool raw_trace)
+                                   bool raw_trace, int level)
  {
         struct hpp_dynamic_entry *hde;
  
-       hde = __alloc_dynamic_entry(evsel, field);
+       hde = __alloc_dynamic_entry(evsel, field, level);
         if (hde == NULL)
                 return -ENOMEM;
  
@@ -1922,14 +2026,14 @@ static int __dynamic_dimension__add(struct perf_evsel *evsel,
         return 0;
  }
  
-static int add_evsel_fields(struct perf_evsel *evsel, bool raw_trace)
+static int add_evsel_fields(struct perf_evsel *evsel, bool raw_trace, int level)
  {
         int ret;
         struct format_field *field;
  
         field = evsel->tp_format->format.fields;
         while (field) {
-               ret = __dynamic_dimension__add(evsel, field, raw_trace);
+               ret = __dynamic_dimension__add(evsel, field, raw_trace, level);
                 if (ret < 0)
                         return ret;
  
@@ -1938,7 +2042,8 @@ static int add_evsel_fields(struct perf_evsel *evsel, bool raw_trace)
         return 0;
  }
  
-static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace)
+static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace,
+                                 int level)
  {
         int ret;
         struct perf_evsel *evsel;
@@ -1947,7 +2052,7 @@ static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace)
                 if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
                         continue;
  
-               ret = add_evsel_fields(evsel, raw_trace);
+               ret = add_evsel_fields(evsel, raw_trace, level);
                 if (ret < 0)
                         return ret;
         }
@@ -1955,7 +2060,7 @@ static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace)
  }
  
  static int add_all_matching_fields(struct perf_evlist *evlist,
-                                  char *field_name, bool raw_trace)
+                                  char *field_name, bool raw_trace, int level)
  {
         int ret = -ESRCH;
         struct perf_evsel *evsel;
@@ -1969,14 +2074,15 @@ static int add_all_matching_fields(struct perf_evlist *evlist,
                 if (field == NULL)
                         continue;
  
-               ret = __dynamic_dimension__add(evsel, field, raw_trace);
+               ret = __dynamic_dimension__add(evsel, field, raw_trace, level);
                 if (ret < 0)
                         break;
         }
         return ret;
  }
  
-static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok)
+static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok,
+                            int level)
  {
         char *str, *event_name, *field_name, *opt_name;
         struct perf_evsel *evsel;
@@ -2006,12 +2112,12 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok)
         }
  
         if (!strcmp(field_name, "trace_fields")) {
-               ret = add_all_dynamic_fields(evlist, raw_trace);
+               ret = add_all_dynamic_fields(evlist, raw_trace, level);
                 goto out;
         }
  
         if (event_name == NULL) {
-               ret = add_all_matching_fields(evlist, field_name, raw_trace);
+               ret = add_all_matching_fields(evlist, field_name, raw_trace, level);
                 goto out;
         }
  
@@ -2029,7 +2135,7 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok)
         }
  
         if (!strcmp(field_name, "*")) {
-               ret = add_evsel_fields(evsel, raw_trace);
+               ret = add_evsel_fields(evsel, raw_trace, level);
         } else {
                 field = pevent_find_any_field(evsel->tp_format, field_name);
                 if (field == NULL) {
@@ -2038,7 +2144,7 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok)
                         return -ENOENT;
                 }
  
-               ret = __dynamic_dimension__add(evsel, field, raw_trace);
+               ret = __dynamic_dimension__add(evsel, field, raw_trace, level);
         }
  
  out:
@@ -2046,12 +2152,14 @@ out:
         return ret;
  }
  
-static int __sort_dimension__add(struct sort_dimension *sd)
+static int __sort_dimension__add(struct sort_dimension *sd,
+                                struct perf_hpp_list *list,
+                                int level)
  {
         if (sd->taken)
                 return 0;
  
-       if (__sort_dimension__add_hpp_sort(sd) < 0)
+       if (__sort_dimension__add_hpp_sort(sd, list, level) < 0)
                 return -1;
  
         if (sd->entry->se_collapse)
@@ -2062,46 +2170,63 @@ static int __sort_dimension__add(struct sort_dimension *sd)
         return 0;
  }
  
-static int __hpp_dimension__add(struct hpp_dimension *hd)
+static int __hpp_dimension__add(struct hpp_dimension *hd,
+                               struct perf_hpp_list *list,
+                               int level)
  {
-       if (!hd->taken) {
-               hd->taken = 1;
+       struct perf_hpp_fmt *fmt;
  
-               perf_hpp__register_sort_field(hd->fmt);
-       }
+       if (hd->taken)
+               return 0;
+
+       fmt = __hpp_dimension__alloc_hpp(hd, level);
+       if (!fmt)
+               return -1;
+
+       hd->taken = 1;
+       perf_hpp_list__register_sort_field(list, fmt);
         return 0;
  }
  
-static int __sort_dimension__add_output(struct sort_dimension *sd)
+static int __sort_dimension__add_output(struct perf_hpp_list *list,
+                                       struct sort_dimension *sd)
  {
         if (sd->taken)
                 return 0;
  
-       if (__sort_dimension__add_hpp_output(sd) < 0)
+       if (__sort_dimension__add_hpp_output(sd, list) < 0)
                 return -1;
  
         sd->taken = 1;
         return 0;
  }
  
-static int __hpp_dimension__add_output(struct hpp_dimension *hd)
+static int __hpp_dimension__add_output(struct perf_hpp_list *list,
+                                      struct hpp_dimension *hd)
  {
-       if (!hd->taken) {
-               hd->taken = 1;
+       struct perf_hpp_fmt *fmt;
  
-               perf_hpp__column_register(hd->fmt);
-       }
+       if (hd->taken)
+               return 0;
+
+       fmt = __hpp_dimension__alloc_hpp(hd, 0);
+       if (!fmt)
+               return -1;
+
+       hd->taken = 1;
+       perf_hpp_list__column_register(list, fmt);
         return 0;
  }
  
  int hpp_dimension__add_output(unsigned col)
  {
         BUG_ON(col >= PERF_HPP__MAX_INDEX);
-       return __hpp_dimension__add_output(&hpp_sort_dimensions[col]);
+       return __hpp_dimension__add_output(&perf_hpp_list, &hpp_sort_dimensions[col]);
  }
  
-static int sort_dimension__add(const char *tok,
-                              struct perf_evlist *evlist __maybe_unused)
+static int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
+                              struct perf_evlist *evlist __maybe_unused,
+                              int level)
  {
         unsigned int i;
  
@@ -2136,9 +2261,13 @@ static int sort_dimension__add(const char *tok,
                         sort__has_dso = 1;
                 } else if (sd->entry == &sort_socket) {
                         sort__has_socket = 1;
+               } else if (sd->entry == &sort_thread) {
+                       sort__has_thread = 1;
+               } else if (sd->entry == &sort_comm) {
+                       sort__has_comm = 1;
                 }
  
-               return __sort_dimension__add(sd);
+               return __sort_dimension__add(sd, list, level);
         }
  
         for (i = 0; i < ARRAY_SIZE(hpp_sort_dimensions); i++) {
@@ -2147,7 +2276,7 @@ static int sort_dimension__add(const char *tok,
                 if (strncasecmp(tok, hd->name, strlen(tok)))
                         continue;
  
-               return __hpp_dimension__add(hd);
+               return __hpp_dimension__add(hd, list, level);
         }
  
         for (i = 0; i < ARRAY_SIZE(bstack_sort_dimensions); i++) {
@@ -2162,7 +2291,7 @@ static int sort_dimension__add(const char *tok,
                 if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to)
                         sort__has_sym = 1;
  
-               __sort_dimension__add(sd);
+               __sort_dimension__add(sd, list, level);
                 return 0;
         }
  
@@ -2178,16 +2307,60 @@ static int sort_dimension__add(const char *tok,
                 if (sd->entry == &sort_mem_daddr_sym)
                         sort__has_sym = 1;
  
-               __sort_dimension__add(sd);
+               __sort_dimension__add(sd, list, level);
                 return 0;
         }
  
-       if (!add_dynamic_entry(evlist, tok))
+       if (!add_dynamic_entry(evlist, tok, level))
                 return 0;
  
         return -ESRCH;
  }
  
+static int setup_sort_list(struct perf_hpp_list *list, char *str,
+                          struct perf_evlist *evlist)
+{
+       char *tmp, *tok;
+       int ret = 0;
+       int level = 0;
+       int next_level = 1;
+       bool in_group = false;
+
+       do {
+               tok = str;
+               tmp = strpbrk(str, "{}, ");
+               if (tmp) {
+                       if (in_group)
+                               next_level = level;
+                       else
+                               next_level = level + 1;
+
+                       if (*tmp == '{')
+                               in_group = true;
+                       else if (*tmp == '}')
+                               in_group = false;
+
+                       *tmp = '\0';
+                       str = tmp + 1;
+               }
+
+               if (*tok) {
+                       ret = sort_dimension__add(list, tok, evlist, level);
+                       if (ret == -EINVAL) {
+                               error("Invalid --sort key: `%s'", tok);
+                               break;
+                       } else if (ret == -ESRCH) {
+                               error("Unknown --sort key: `%s'", tok);
+                               break;
+                       }
+               }
+
+               level = next_level;
+       } while (tmp);
+
+       return ret;
+}
+
  static const char *get_default_sort_order(struct perf_evlist *evlist)
  {
         const char *default_sort_orders[] = {
@@ -2282,7 +2455,7 @@ static char *setup_overhead(char *keys)
  
  static int __setup_sorting(struct perf_evlist *evlist)
  {
-       char *tmp, *tok, *str;
+       char *str;
         const char *sort_keys;
         int ret = 0;
  
@@ -2320,17 +2493,7 @@ static int __setup_sorting(struct perf_evlist *evlist)
                 }
         }
  
-       for (tok = strtok_r(str, ", ", &tmp);
-                       tok; tok = strtok_r(NULL, ", ", &tmp)) {
-               ret = sort_dimension__add(tok, evlist);
-               if (ret == -EINVAL) {
-                       error("Invalid --sort key: `%s'", tok);
-                       break;
-               } else if (ret == -ESRCH) {
-                       error("Unknown --sort key: `%s'", tok);
-                       break;
-               }
-       }
+       ret = setup_sort_list(&perf_hpp_list, str, evlist);
  
         free(str);
         return ret;
@@ -2341,7 +2504,7 @@ void perf_hpp__set_elide(int idx, bool elide)
         struct perf_hpp_fmt *fmt;
         struct hpp_sort_entry *hse;
  
-       perf_hpp__for_each_format(fmt) {
+       perf_hpp_list__for_each_format(&perf_hpp_list, fmt) {
                 if (!perf_hpp__is_sort_entry(fmt))
                         continue;
  
@@ -2401,7 +2564,7 @@ void sort__setup_elide(FILE *output)
         struct perf_hpp_fmt *fmt;
         struct hpp_sort_entry *hse;
  
-       perf_hpp__for_each_format(fmt) {
+       perf_hpp_list__for_each_format(&perf_hpp_list, fmt) {
                 if (!perf_hpp__is_sort_entry(fmt))
                         continue;
  
@@ -2413,7 +2576,7 @@ void sort__setup_elide(FILE *output)
          * It makes no sense to elide all of sort entries.
          * Just revert them to show up again.
          */
-       perf_hpp__for_each_format(fmt) {
+       perf_hpp_list__for_each_format(&perf_hpp_list, fmt) {
                 if (!perf_hpp__is_sort_entry(fmt))
                         continue;
  
@@ -2421,7 +2584,7 @@ void sort__setup_elide(FILE *output)
                         return;
         }
  
-       perf_hpp__for_each_format(fmt) {
+       perf_hpp_list__for_each_format(&perf_hpp_list, fmt) {
                 if (!perf_hpp__is_sort_entry(fmt))
                         continue;
  
@@ -2429,7 +2592,7 @@ void sort__setup_elide(FILE *output)
         }
  }
  
-static int output_field_add(char *tok)
+static int output_field_add(struct perf_hpp_list *list, char *tok)
  {
         unsigned int i;
  
@@ -2439,7 +2602,7 @@ static int output_field_add(char *tok)
                 if (strncasecmp(tok, sd->name, strlen(tok)))
                         continue;
  
-               return __sort_dimension__add_output(sd);
+               return __sort_dimension__add_output(list, sd);
         }
  
         for (i = 0; i < ARRAY_SIZE(hpp_sort_dimensions); i++) {
@@ -2448,7 +2611,7 @@ static int output_field_add(char *tok)
                 if (strncasecmp(tok, hd->name, strlen(tok)))
                         continue;
  
-               return __hpp_dimension__add_output(hd);
+               return __hpp_dimension__add_output(list, hd);
         }
  
         for (i = 0; i < ARRAY_SIZE(bstack_sort_dimensions); i++) {
@@ -2457,7 +2620,7 @@ static int output_field_add(char *tok)
                 if (strncasecmp(tok, sd->name, strlen(tok)))
                         continue;
  
-               return __sort_dimension__add_output(sd);
+               return __sort_dimension__add_output(list, sd);
         }
  
         for (i = 0; i < ARRAY_SIZE(memory_sort_dimensions); i++) {
@@ -2466,12 +2629,32 @@ static int output_field_add(char *tok)
                 if (strncasecmp(tok, sd->name, strlen(tok)))
                         continue;
  
-               return __sort_dimension__add_output(sd);
+               return __sort_dimension__add_output(list, sd);
         }
  
         return -ESRCH;
  }
  
+static int setup_output_list(struct perf_hpp_list *list, char *str)
+{
+       char *tmp, *tok;
+       int ret = 0;
+
+       for (tok = strtok_r(str, ", ", &tmp);
+                       tok; tok = strtok_r(NULL, ", ", &tmp)) {
+               ret = output_field_add(list, tok);
+               if (ret == -EINVAL) {
+                       error("Invalid --fields key: `%s'", tok);
+                       break;
+               } else if (ret == -ESRCH) {
+                       error("Unknown --fields key: `%s'", tok);
+                       break;
+               }
+       }
+
+       return ret;
+}
+
  static void reset_dimensions(void)
  {
         unsigned int i;
@@ -2496,7 +2679,7 @@ bool is_strict_order(const char *order)
  
  static int __setup_output_field(void)
  {
-       char *tmp, *tok, *str, *strp;
+       char *str, *strp;
         int ret = -EINVAL;
  
         if (field_order == NULL)
@@ -2516,17 +2699,7 @@ static int __setup_output_field(void)
                 goto out;
         }
  
-       for (tok = strtok_r(strp, ", ", &tmp);
-                       tok; tok = strtok_r(NULL, ", ", &tmp)) {
-               ret = output_field_add(tok);
-               if (ret == -EINVAL) {
-                       error("Invalid --fields key: `%s'", tok);
-                       break;
-               } else if (ret == -ESRCH) {
-                       error("Unknown --fields key: `%s'", tok);
-                       break;
-               }
-       }
+       ret = setup_output_list(&perf_hpp_list, strp);
  
  out:
         free(str);
@@ -2542,7 +2715,7 @@ int setup_sorting(struct perf_evlist *evlist)
                 return err;
  
         if (parent_pattern != default_parent_pattern) {
-               err = sort_dimension__add("parent", evlist);
+               err = sort_dimension__add(&perf_hpp_list, "parent", evlist, -1);
                 if (err < 0)
                         return err;
         }
@@ -2560,9 +2733,13 @@ int setup_sorting(struct perf_evlist *evlist)
                 return err;
  
         /* copy sort keys to output fields */
-       perf_hpp__setup_output_field();
+       perf_hpp__setup_output_field(&perf_hpp_list);
         /* and then copy output fields to sort keys */
-       perf_hpp__append_sort_keys();
+       perf_hpp__append_sort_keys(&perf_hpp_list);
+
+       /* setup hists-specific output fields */
+       if (perf_hpp__setup_hists_formats(&perf_hpp_list, evlist) < 0)
+               return -1;
  
         return 0;
  }
@@ -2578,5 +2755,5 @@ void reset_output_field(void)
         sort_order = NULL;
  
         reset_dimensions();
-       perf_hpp__reset_output_field();
+       perf_hpp__reset_output_field(&perf_hpp_list);
  }
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h

index 687bbb1244281ba65a99c3cb78f5bcb8a1eaa41b..3f4e359981192ac50b56e770aa472749666e1f98 100644 (file)
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -32,9 +32,12 @@ extern const char default_sort_order[];
  extern regex_t ignore_callees_regex;
  extern int have_ignore_callees;
  extern int sort__need_collapse;
+extern int sort__has_dso;
  extern int sort__has_parent;
  extern int sort__has_sym;
  extern int sort__has_socket;
+extern int sort__has_thread;
+extern int sort__has_comm;
  extern enum sort_mode sort__mode;
  extern struct sort_entry sort_comm;
  extern struct sort_entry sort_dso;
@@ -94,9 +97,11 @@ struct hist_entry {
         s32                     socket;
         s32                     cpu;
         u8                      cpumode;
+       u8                      depth;
  
         /* We are added by hists__add_dummy_entry. */
         bool                    dummy;
+       bool                    leaf;
  
         char                    level;
         u8                      filtered;
@@ -113,18 +118,28 @@ struct hist_entry {
                         bool    init_have_children;
                         bool    unfolded;
                         bool    has_children;
+                       bool    has_no_entry;
                 };
         };
         char                    *srcline;
         char                    *srcfile;
         struct symbol           *parent;
-       struct rb_root          sorted_chain;
         struct branch_info      *branch_info;
         struct hists            *hists;
         struct mem_info         *mem_info;
         void                    *raw_data;
         u32                     raw_size;
         void                    *trace_output;
+       struct perf_hpp_list    *hpp_list;
+       struct hist_entry       *parent_he;
+       union {
+               /* this is for hierarchical entry structure */
+               struct {
+                       struct rb_root  hroot_in;
+                       struct rb_root  hroot_out;
+               };                              /* non-leaf entries */
+               struct rb_root  sorted_chain;   /* leaf entry has callchains */
+       };
         struct callchain_root   callchain[0]; /* must be last member */
  };
  
@@ -160,6 +175,17 @@ static inline float hist_entry__get_percent_limit(struct hist_entry *he)
         return period * 100.0 / total_period;
  }
  
+static inline u64 cl_address(u64 address)
+{
+       /* return the cacheline of the address */
+       return (address & ~(cacheline_size - 1));
+}
+
+static inline u64 cl_offset(u64 address)
+{
+       /* return the cacheline of the address */
+       return (address & (cacheline_size - 1));
+}
  
  enum sort_mode {
         SORT_MODE__NORMAL,
@@ -221,6 +247,7 @@ struct sort_entry {
         int64_t (*se_sort)(struct hist_entry *, struct hist_entry *);
         int     (*se_snprintf)(struct hist_entry *he, char *bf, size_t size,
                                unsigned int width);
+       int     (*se_filter)(struct hist_entry *he, int type, const void *arg);
         u8      se_width_idx;
  };
  
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c

index 6ac03146889d29be60707efd6e04c27c919f64de..b33ffb2af2cf5900378fb9fb8ef60e9eddaaf9db 100644 (file)
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -2,6 +2,7 @@
  #include "evsel.h"
  #include "stat.h"
  #include "color.h"
+#include "pmu.h"
  
  enum {
         CTX_BIT_USER    = 1 << 0,
@@ -14,6 +15,13 @@ enum {
  
  #define NUM_CTX CTX_BIT_MAX
  
+/*
+ * AGGR_GLOBAL: Use CPU 0
+ * AGGR_SOCKET: Use first CPU of socket
+ * AGGR_CORE: Use first CPU of core
+ * AGGR_NONE: Use matching CPU
+ * AGGR_THREAD: Not supported?
+ */
  static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
  static struct stats runtime_cycles_stats[NUM_CTX][MAX_NR_CPUS];
  static struct stats runtime_stalled_cycles_front_stats[NUM_CTX][MAX_NR_CPUS];
@@ -28,9 +36,15 @@ static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
  static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
  static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
  static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
+static bool have_frontend_stalled;
  
  struct stats walltime_nsecs_stats;
  
+void perf_stat__init_shadow_stats(void)
+{
+       have_frontend_stalled = pmu_have_event("cpu", "stalled-cycles-frontend");
+}
+
  static int evsel_context(struct perf_evsel *evsel)
  {
         int ctx = 0;
@@ -137,9 +151,10 @@ static const char *get_ratio_color(enum grc_type type, double ratio)
         return color;
  }
  
-static void print_stalled_cycles_frontend(FILE *out, int cpu,
+static void print_stalled_cycles_frontend(int cpu,
                                           struct perf_evsel *evsel
-                                         __maybe_unused, double avg)
+                                         __maybe_unused, double avg,
+                                         struct perf_stat_output_ctx *out)
  {
         double total, ratio = 0.0;
         const char *color;
@@ -152,14 +167,17 @@ static void print_stalled_cycles_frontend(FILE *out, int cpu,
  
         color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);
  
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " frontend cycles idle   ");
+       if (ratio)
+               out->print_metric(out->ctx, color, "%7.2f%%", "frontend cycles idle",
+                                 ratio);
+       else
+               out->print_metric(out->ctx, NULL, NULL, "frontend cycles idle", 0);
  }
  
-static void print_stalled_cycles_backend(FILE *out, int cpu,
+static void print_stalled_cycles_backend(int cpu,
                                          struct perf_evsel *evsel
-                                        __maybe_unused, double avg)
+                                        __maybe_unused, double avg,
+                                        struct perf_stat_output_ctx *out)
  {
         double total, ratio = 0.0;
         const char *color;
@@ -172,14 +190,13 @@ static void print_stalled_cycles_backend(FILE *out, int cpu,
  
         color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);
  
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " backend  cycles idle   ");
+       out->print_metric(out->ctx, color, "%6.2f%%", "backend cycles idle", ratio);
  }
  
-static void print_branch_misses(FILE *out, int cpu,
+static void print_branch_misses(int cpu,
                                 struct perf_evsel *evsel __maybe_unused,
-                               double avg)
+                               double avg,
+                               struct perf_stat_output_ctx *out)
  {
         double total, ratio = 0.0;
         const char *color;
@@ -192,14 +209,13 @@ static void print_branch_misses(FILE *out, int cpu,
  
         color = get_ratio_color(GRC_CACHE_MISSES, ratio);
  
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all branches        ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all branches", ratio);
  }
  
-static void print_l1_dcache_misses(FILE *out, int cpu,
+static void print_l1_dcache_misses(int cpu,
                                    struct perf_evsel *evsel __maybe_unused,
-                                  double avg)
+                                  double avg,
+                                  struct perf_stat_output_ctx *out)
  {
         double total, ratio = 0.0;
         const char *color;
@@ -212,14 +228,13 @@ static void print_l1_dcache_misses(FILE *out, int cpu,
  
         color = get_ratio_color(GRC_CACHE_MISSES, ratio);
  
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all L1-dcache hits  ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-dcache hits", ratio);
  }
  
-static void print_l1_icache_misses(FILE *out, int cpu,
+static void print_l1_icache_misses(int cpu,
                                    struct perf_evsel *evsel __maybe_unused,
-                                  double avg)
+                                  double avg,
+                                  struct perf_stat_output_ctx *out)
  {
         double total, ratio = 0.0;
         const char *color;
@@ -231,15 +246,13 @@ static void print_l1_icache_misses(FILE *out, int cpu,
                 ratio = avg / total * 100.0;
  
         color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all L1-icache hits  ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-icache hits", ratio);
  }
  
-static void print_dtlb_cache_misses(FILE *out, int cpu,
+static void print_dtlb_cache_misses(int cpu,
                                     struct perf_evsel *evsel __maybe_unused,
-                                   double avg)
+                                   double avg,
+                                   struct perf_stat_output_ctx *out)
  {
         double total, ratio = 0.0;
         const char *color;
@@ -251,15 +264,13 @@ static void print_dtlb_cache_misses(FILE *out, int cpu,
                 ratio = avg / total * 100.0;
  
         color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all dTLB cache hits ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all dTLB cache hits", ratio);
  }
  
-static void print_itlb_cache_misses(FILE *out, int cpu,
+static void print_itlb_cache_misses(int cpu,
                                     struct perf_evsel *evsel __maybe_unused,
-                                   double avg)
+                                   double avg,
+                                   struct perf_stat_output_ctx *out)
  {
         double total, ratio = 0.0;
         const char *color;
@@ -271,15 +282,13 @@ static void print_itlb_cache_misses(FILE *out, int cpu,
                 ratio = avg / total * 100.0;
  
         color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all iTLB cache hits ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all iTLB cache hits", ratio);
  }
  
-static void print_ll_cache_misses(FILE *out, int cpu,
+static void print_ll_cache_misses(int cpu,
                                   struct perf_evsel *evsel __maybe_unused,
-                                 double avg)
+                                 double avg,
+                                 struct perf_stat_output_ctx *out)
  {
         double total, ratio = 0.0;
         const char *color;
@@ -291,15 +300,15 @@ static void print_ll_cache_misses(FILE *out, int cpu,
                 ratio = avg / total * 100.0;
  
         color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all LL-cache hits   ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
  }
  
-void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel,
-                                  double avg, int cpu, enum aggr_mode aggr)
+void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
+                                  double avg, int cpu,
+                                  struct perf_stat_output_ctx *out)
  {
+       void *ctxp = out->ctx;
+       print_metric_t print_metric = out->print_metric;
         double total, ratio = 0.0, total2;
         int ctx = evsel_context(evsel);
  
@@ -307,119 +316,145 @@ void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel,
                 total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
                 if (total) {
                         ratio = avg / total;
-                       fprintf(out, " #   %5.2f  insns per cycle        ", ratio);
+                       print_metric(ctxp, NULL, "%7.2f ",
+                                       "insn per cycle", ratio);
                 } else {
-                       fprintf(out, "                                   ");
+                       print_metric(ctxp, NULL, NULL, "insn per cycle", 0);
                 }
                 total = avg_stats(&runtime_stalled_cycles_front_stats[ctx][cpu]);
                 total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[ctx][cpu]));
  
                 if (total && avg) {
+                       out->new_line(ctxp);
                         ratio = total / avg;
-                       fprintf(out, "\n");
-                       if (aggr == AGGR_NONE)
-                               fprintf(out, "        ");
-                       fprintf(out, "                                                  #   %5.2f  stalled cycles per insn", ratio);
+                       print_metric(ctxp, NULL, "%7.2f ",
+                                       "stalled cycles per insn",
+                                       ratio);
+               } else if (have_frontend_stalled) {
+                       print_metric(ctxp, NULL, NULL,
+                                    "stalled cycles per insn", 0);
                 }
-
-       } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
-                       runtime_branches_stats[ctx][cpu].n != 0) {
-               print_branch_misses(out, cpu, evsel, avg);
+       } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) {
+               if (runtime_branches_stats[ctx][cpu].n != 0)
+                       print_branch_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all branches", 0);
         } else if (
                 evsel->attr.type == PERF_TYPE_HW_CACHE &&
                 evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
                                         ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_l1_dcache_stats[ctx][cpu].n != 0) {
-               print_l1_dcache_misses(out, cpu, evsel, avg);
+                                        ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
+               if (runtime_l1_dcache_stats[ctx][cpu].n != 0)
+                       print_l1_dcache_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all L1-dcache hits", 0);
         } else if (
                 evsel->attr.type == PERF_TYPE_HW_CACHE &&
                 evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1I |
                                         ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_l1_icache_stats[ctx][cpu].n != 0) {
-               print_l1_icache_misses(out, cpu, evsel, avg);
+                                        ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
+               if (runtime_l1_icache_stats[ctx][cpu].n != 0)
+                       print_l1_icache_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all L1-icache hits", 0);
         } else if (
                 evsel->attr.type == PERF_TYPE_HW_CACHE &&
                 evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_DTLB |
                                         ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_dtlb_cache_stats[ctx][cpu].n != 0) {
-               print_dtlb_cache_misses(out, cpu, evsel, avg);
+                                        ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
+               if (runtime_dtlb_cache_stats[ctx][cpu].n != 0)
+                       print_dtlb_cache_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all dTLB cache hits", 0);
         } else if (
                 evsel->attr.type == PERF_TYPE_HW_CACHE &&
                 evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_ITLB |
                                         ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_itlb_cache_stats[ctx][cpu].n != 0) {
-               print_itlb_cache_misses(out, cpu, evsel, avg);
+                                        ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
+               if (runtime_itlb_cache_stats[ctx][cpu].n != 0)
+                       print_itlb_cache_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all iTLB cache hits", 0);
         } else if (
                 evsel->attr.type == PERF_TYPE_HW_CACHE &&
                 evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_LL |
                                         ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_ll_cache_stats[ctx][cpu].n != 0) {
-               print_ll_cache_misses(out, cpu, evsel, avg);
-       } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
-                       runtime_cacherefs_stats[ctx][cpu].n != 0) {
+                                        ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
+               if (runtime_ll_cache_stats[ctx][cpu].n != 0)
+                       print_ll_cache_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all LL-cache hits", 0);
+       } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) {
                 total = avg_stats(&runtime_cacherefs_stats[ctx][cpu]);
  
                 if (total)
                         ratio = avg * 100 / total;
  
-               fprintf(out, " # %8.3f %% of all cache refs    ", ratio);
-
+               if (runtime_cacherefs_stats[ctx][cpu].n != 0)
+                       print_metric(ctxp, NULL, "%8.3f %%",
+                                    "of all cache refs", ratio);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all cache refs", 0);
         } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
-               print_stalled_cycles_frontend(out, cpu, evsel, avg);
+               print_stalled_cycles_frontend(cpu, evsel, avg, out);
         } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
-               print_stalled_cycles_backend(out, cpu, evsel, avg);
+               print_stalled_cycles_backend(cpu, evsel, avg, out);
         } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
                 total = avg_stats(&runtime_nsecs_stats[cpu]);
  
                 if (total) {
                         ratio = avg / total;
-                       fprintf(out, " # %8.3f GHz                    ", ratio);
+                       print_metric(ctxp, NULL, "%8.3f", "GHz", ratio);
                 } else {
-                       fprintf(out, "                                   ");
+                       print_metric(ctxp, NULL, NULL, "Ghz", 0);
                 }
         } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) {
                 total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
                 if (total)
-                       fprintf(out,
-                               " #   %5.2f%% transactional cycles   ",
-                               100.0 * (avg / total));
+                       print_metric(ctxp, NULL,
+                                       "%7.2f%%", "transactional cycles",
+                                       100.0 * (avg / total));
+               else
+                       print_metric(ctxp, NULL, NULL, "transactional cycles",
+                                    0);
         } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) {
                 total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
                 total2 = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
                 if (total2 < avg)
                         total2 = avg;
                 if (total)
-                       fprintf(out,
-                               " #   %5.2f%% aborted cycles         ",
+                       print_metric(ctxp, NULL, "%7.2f%%", "aborted cycles",
                                 100.0 * ((total2-avg) / total));
-       } else if (perf_stat_evsel__is(evsel, TRANSACTION_START) &&
-                  runtime_cycles_in_tx_stats[ctx][cpu].n != 0) {
+               else
+                       print_metric(ctxp, NULL, NULL, "aborted cycles", 0);
+       } else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) {
                 total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
  
                 if (avg)
                         ratio = total / avg;
  
-               fprintf(out, " # %8.0f cycles / transaction   ", ratio);
-       } else if (perf_stat_evsel__is(evsel, ELISION_START) &&
-                  runtime_cycles_in_tx_stats[ctx][cpu].n != 0) {
+               if (runtime_cycles_in_tx_stats[ctx][cpu].n != 0)
+                       print_metric(ctxp, NULL, "%8.0f",
+                                    "cycles / transaction", ratio);
+               else
+                       print_metric(ctxp, NULL, NULL, "cycles / transaction",
+                                    0);
+       } else if (perf_stat_evsel__is(evsel, ELISION_START)) {
                 total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
  
                 if (avg)
                         ratio = total / avg;
  
-               fprintf(out, " # %8.0f cycles / elision       ", ratio);
+               print_metric(ctxp, NULL, "%8.0f", "cycles / elision", ratio);
         } else if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) {
                 if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0)
-                       fprintf(out, " # %8.3f CPUs utilized          ", avg / ratio);
+                       print_metric(ctxp, NULL, "%8.3f", "CPUs utilized",
+                                    avg / ratio);
                 else
-                       fprintf(out, "                                   ");
+                       print_metric(ctxp, NULL, NULL, "CPUs utilized", 0);
         } else if (runtime_nsecs_stats[cpu].n != 0) {
                 char unit = 'M';
+               char unit_buf[10];
  
                 total = avg_stats(&runtime_nsecs_stats[cpu]);
  
@@ -429,9 +464,9 @@ void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel,
                         ratio *= 1000;
                         unit = 'K';
                 }
-
-               fprintf(out, " # %8.3f %c/sec                  ", ratio, unit);
+               snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
+               print_metric(ctxp, NULL, "%8.3f", unit_buf, ratio);
         } else {
-               fprintf(out, "                                   ");
+               print_metric(ctxp, NULL, NULL, NULL, 0);
         }
  }
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c

index afb0c45eba34ba8db7a6cb6d258326f545f7aca1..4d9b481cf3b6edbb7d6161cbd238709241dedc5b 100644 (file)
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -97,7 +97,7 @@ void perf_stat_evsel_id_init(struct perf_evsel *evsel)
         }
  }
  
-void perf_evsel__reset_stat_priv(struct perf_evsel *evsel)
+static void perf_evsel__reset_stat_priv(struct perf_evsel *evsel)
  {
         int i;
         struct perf_stat_evsel *ps = evsel->priv;
@@ -108,7 +108,7 @@ void perf_evsel__reset_stat_priv(struct perf_evsel *evsel)
         perf_stat_evsel_id_init(evsel);
  }
  
-int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
+static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
  {
         evsel->priv = zalloc(sizeof(struct perf_stat_evsel));
         if (evsel->priv == NULL)
@@ -117,13 +117,13 @@ int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
         return 0;
  }
  
-void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
+static void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
  {
         zfree(&evsel->priv);
  }
  
-int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel,
-                                     int ncpus, int nthreads)
+static int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel,
+                                            int ncpus, int nthreads)
  {
         struct perf_counts *counts;
  
@@ -134,13 +134,13 @@ int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel,
         return counts ? 0 : -ENOMEM;
  }
  
-void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel)
+static void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel)
  {
         perf_counts__delete(evsel->prev_raw_counts);
         evsel->prev_raw_counts = NULL;
  }
  
-int perf_evsel__alloc_stats(struct perf_evsel *evsel, bool alloc_raw)
+static int perf_evsel__alloc_stats(struct perf_evsel *evsel, bool alloc_raw)
  {
         int ncpus = perf_evsel__nr_cpus(evsel);
         int nthreads = thread_map__nr(evsel->threads);
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h

index 086f4e128d6351f0e0de862c2ebcc44801cbc2f8..0150e786ccc7c1f48e407323b606f47155a076c9 100644 (file)
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -68,21 +68,23 @@ void perf_stat_evsel_id_init(struct perf_evsel *evsel);
  
  extern struct stats walltime_nsecs_stats;
  
+typedef void (*print_metric_t)(void *ctx, const char *color, const char *unit,
+                              const char *fmt, double val);
+typedef void (*new_line_t )(void *ctx);
+
+void perf_stat__init_shadow_stats(void);
  void perf_stat__reset_shadow_stats(void);
  void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
                                     int cpu);
-void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel,
-                                  double avg, int cpu, enum aggr_mode aggr);
-
-void perf_evsel__reset_stat_priv(struct perf_evsel *evsel);
-int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel);
-void perf_evsel__free_stat_priv(struct perf_evsel *evsel);
-
-int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel,
-                                     int ncpus, int nthreads);
-void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel);
+struct perf_stat_output_ctx {
+       void *ctx;
+       print_metric_t print_metric;
+       new_line_t new_line;
+};
  
-int perf_evsel__alloc_stats(struct perf_evsel *evsel, bool alloc_raw);
+void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
+                                  double avg, int cpu,
+                                  struct perf_stat_output_ctx *out);
  
  int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw);
  void perf_evlist__free_stats(struct perf_evlist *evlist);
diff --git a/tools/perf/util/strbuf.c b/tools/perf/util/strbuf.c

index 25671fa166188413c66758a978082e89a1ba08d2..d3d279275432ee663641a1a6dbb0a0cf8d3b8334 100644 (file)
--- a/tools/perf/util/strbuf.c
+++ b/tools/perf/util/strbuf.c
@@ -51,30 +51,6 @@ void strbuf_grow(struct strbuf *sb, size_t extra)
         ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc);
  }
  
-static void strbuf_splice(struct strbuf *sb, size_t pos, size_t len,
-                                  const void *data, size_t dlen)
-{
-       if (pos + len < pos)
-               die("you want to use way too much memory");
-       if (pos > sb->len)
-               die("`pos' is too far after the end of the buffer");
-       if (pos + len > sb->len)
-               die("`pos + len' is too far after the end of the buffer");
-
-       if (dlen >= len)
-               strbuf_grow(sb, dlen - len);
-       memmove(sb->buf + pos + dlen,
-                       sb->buf + pos + len,
-                       sb->len - pos - len);
-       memcpy(sb->buf + pos, data, dlen);
-       strbuf_setlen(sb, sb->len + dlen - len);
-}
-
-void strbuf_remove(struct strbuf *sb, size_t pos, size_t len)
-{
-       strbuf_splice(sb, pos, len, NULL, 0);
-}
-
  void strbuf_add(struct strbuf *sb, const void *data, size_t len)
  {
         strbuf_grow(sb, len);
diff --git a/tools/perf/util/strbuf.h b/tools/perf/util/strbuf.h

index 529f2f03524915ab9cae7c5608de444fd875812d..7a32c838884d8de6feaa3223e388199c5c939301 100644 (file)
--- a/tools/perf/util/strbuf.h
+++ b/tools/perf/util/strbuf.h
@@ -77,8 +77,6 @@ static inline void strbuf_addch(struct strbuf *sb, int c) {
         sb->buf[sb->len] = '\0';
  }
  
-extern void strbuf_remove(struct strbuf *, size_t pos, size_t len);
-
  extern void strbuf_add(struct strbuf *, const void *, size_t);
  static inline void strbuf_addstr(struct strbuf *sb, const char *s) {
         strbuf_add(sb, s, strlen(s));
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c

index 562b8ebeae5b2414b6bac867c928cb22ee9a5f13..b1dd68f358fcd8e7390b5929ed736a357c7853c1 100644 (file)
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -6,6 +6,7 @@
  #include <inttypes.h>
  
  #include "symbol.h"
+#include "demangle-java.h"
  #include "machine.h"
  #include "vdso.h"
  #include <symbol/kallsyms.h>
@@ -1077,6 +1078,8 @@ new_symbol:
                                 demangle_flags = DMGL_PARAMS | DMGL_ANSI;
  
                         demangled = bfd_demangle(NULL, elf_name, demangle_flags);
+                       if (demangled == NULL)
+                               demangled = java_demangle_sym(elf_name, JAVA_DEMANGLE_NORET);
                         if (demangled != NULL)
                                 elf_name = demangled;
                 }
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c

index ab02209a7cf3b162431bfc006287f5fbd8c68f43..e7588dc915181729394c1d2195c78576c47d20df 100644 (file)
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -1466,7 +1466,8 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
          * Read the build id if possible. This is required for
          * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work
          */
-       if (filename__read_build_id(dso->long_name, build_id, BUILD_ID_SIZE) > 0)
+       if (is_regular_file(name) &&
+           filename__read_build_id(dso->long_name, build_id, BUILD_ID_SIZE) > 0)
                 dso__set_build_id(dso, build_id);
  
         /*
@@ -1487,6 +1488,9 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
                                                    root_dir, name, PATH_MAX))
                         continue;
  
+               if (!is_regular_file(name))
+                       continue;
+
                 /* Name is now the name of the next image to try */
                 if (symsrc__init(ss, dso, name, symtab_type) < 0)
                         continue;
@@ -1525,6 +1529,10 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
         if (!runtime_ss && syms_ss)
                 runtime_ss = syms_ss;
  
+       if (syms_ss && syms_ss->type == DSO_BINARY_TYPE__BUILD_ID_CACHE)
+               if (dso__build_id_is_kmod(dso, name, PATH_MAX))
+                       kmod = true;
+
         if (syms_ss)
                 ret = dso__load_sym(dso, map, syms_ss, runtime_ss, filter, kmod);
         else
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h

index ccd1caa40e116645be37025665388886d7c97546..a937053a0ae07a6214fb03753faa66819ed40884 100644 (file)
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -110,7 +110,8 @@ struct symbol_conf {
                         has_filter,
                         show_ref_callgraph,
                         hide_unresolved,
-                       raw_trace;
+                       raw_trace,
+                       report_hierarchy;
         const char      *vmlinux_name,
                         *kallsyms_name,
                         *source_prefix,
diff --git a/tools/perf/util/trace-event.c b/tools/perf/util/trace-event.c

index 802bb868d446cafa7c5383982193ad13d87b785a..8ae051e0ec79090e674fdf9a6cc9fbece8275ebb 100644 (file)
--- a/tools/perf/util/trace-event.c
+++ b/tools/perf/util/trace-event.c
@@ -10,6 +10,7 @@
  #include <linux/err.h>
  #include <traceevent/event-parse.h>
  #include <api/fs/tracing_path.h>
+#include <api/fs/fs.h>
  #include "trace-event.h"
  #include "machine.h"
  #include "util.h"
diff --git a/tools/perf/util/tsc.c b/tools/perf/util/tsc.c

index 4d4210d4e13d1d7fbc32a5c1f4afff58a0810fd0..1b741646eed00b7754ba0a618b4fb57b8c87d9ac 100644 (file)
--- a/tools/perf/util/tsc.c
+++ b/tools/perf/util/tsc.c
@@ -19,7 +19,7 @@ u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc)
         u64 quot, rem;
  
         quot = cyc >> tc->time_shift;
-       rem  = cyc & ((1 << tc->time_shift) - 1);
+       rem  = cyc & (((u64)1 << tc->time_shift) - 1);
         return tc->time_zero + quot * tc->time_mult +
                ((rem * tc->time_mult) >> tc->time_shift);
  }
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c

index ead9509835d23e2aee29b9f3548613973a5937fb..b7766c577b015d978fd3e9960c451692f81daa6e 100644 (file)
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -14,6 +14,7 @@
  #include <limits.h>
  #include <byteswap.h>
  #include <linux/kernel.h>
+#include <linux/log2.h>
  #include <unistd.h>
  #include "callchain.h"
  #include "strlist.h"
@@ -507,54 +508,6 @@ int parse_callchain_record(const char *arg, struct callchain_param *param)
         return ret;
  }
  
-int filename__read_str(const char *filename, char **buf, size_t *sizep)
-{
-       size_t size = 0, alloc_size = 0;
-       void *bf = NULL, *nbf;
-       int fd, n, err = 0;
-       char sbuf[STRERR_BUFSIZE];
-
-       fd = open(filename, O_RDONLY);
-       if (fd < 0)
-               return -errno;
-
-       do {
-               if (size == alloc_size) {
-                       alloc_size += BUFSIZ;
-                       nbf = realloc(bf, alloc_size);
-                       if (!nbf) {
-                               err = -ENOMEM;
-                               break;
-                       }
-
-                       bf = nbf;
-               }
-
-               n = read(fd, bf + size, alloc_size - size);
-               if (n < 0) {
-                       if (size) {
-                               pr_warning("read failed %d: %s\n", errno,
-                                        strerror_r(errno, sbuf, sizeof(sbuf)));
-                               err = 0;
-                       } else
-                               err = -errno;
-
-                       break;
-               }
-
-               size += n;
-       } while (n > 0);
-
-       if (!err) {
-               *sizep = size;
-               *buf   = bf;
-       } else
-               free(bf);
-
-       close(fd);
-       return err;
-}
-
  const char *get_filename_for_perf_kvm(void)
  {
         const char *filename;
@@ -691,3 +644,66 @@ out:
  
         return tip;
  }
+
+bool is_regular_file(const char *file)
+{
+       struct stat st;
+
+       if (stat(file, &st))
+               return false;
+
+       return S_ISREG(st.st_mode);
+}
+
+int fetch_current_timestamp(char *buf, size_t sz)
+{
+       struct timeval tv;
+       struct tm tm;
+       char dt[32];
+
+       if (gettimeofday(&tv, NULL) || !localtime_r(&tv.tv_sec, &tm))
+               return -1;
+
+       if (!strftime(dt, sizeof(dt), "%Y%m%d%H%M%S", &tm))
+               return -1;
+
+       scnprintf(buf, sz, "%s%02u", dt, (unsigned)tv.tv_usec / 10000);
+
+       return 0;
+}
+
+void print_binary(unsigned char *data, size_t len,
+                 size_t bytes_per_line, print_binary_t printer,
+                 void *extra)
+{
+       size_t i, j, mask;
+
+       if (!printer)
+               return;
+
+       bytes_per_line = roundup_pow_of_two(bytes_per_line);
+       mask = bytes_per_line - 1;
+
+       printer(BINARY_PRINT_DATA_BEGIN, 0, extra);
+       for (i = 0; i < len; i++) {
+               if ((i & mask) == 0) {
+                       printer(BINARY_PRINT_LINE_BEGIN, -1, extra);
+                       printer(BINARY_PRINT_ADDR, i, extra);
+               }
+
+               printer(BINARY_PRINT_NUM_DATA, data[i], extra);
+
+               if (((i & mask) == mask) || i == len - 1) {
+                       for (j = 0; j < mask-(i & mask); j++)
+                               printer(BINARY_PRINT_NUM_PAD, -1, extra);
+
+                       printer(BINARY_PRINT_SEP, i, extra);
+                       for (j = i & ~mask; j <= i; j++)
+                               printer(BINARY_PRINT_CHAR_DATA, data[j], extra);
+                       for (j = 0; j < mask-(i & mask); j++)
+                               printer(BINARY_PRINT_CHAR_PAD, i, extra);
+                       printer(BINARY_PRINT_LINE_END, -1, extra);
+               }
+       }
+       printer(BINARY_PRINT_DATA_END, -1, extra);
+}
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h

index fe915e616f9b65388e15be963d0ef5cf58c325fd..d0d50cef8b2af31703c7d298f68d93b68326c3c5 100644 (file)
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -82,6 +82,8 @@
  
  extern const char *graph_line;
  extern const char *graph_dotted_line;
+extern const char *spaces;
+extern const char *dots;
  extern char buildid_dir[];
  
  /* On most systems <limits.h> would have given us this, but
@@ -303,7 +305,6 @@ char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
                   bool show_sym, bool unwind_inlines);
  void free_srcline(char *srcline);
  
-int filename__read_str(const char *filename, char **buf, size_t *sizep);
  int perf_event_paranoid(void);
  
  void mem_bswap_64(void *src, int byte_size);
@@ -343,5 +344,27 @@ int fetch_kernel_version(unsigned int *puint,
  #define KVER_PARAM(x)  KVER_VERSION(x), KVER_PATCHLEVEL(x), KVER_SUBLEVEL(x)
  
  const char *perf_tip(const char *dirpath);
+bool is_regular_file(const char *file);
+int fetch_current_timestamp(char *buf, size_t sz);
+
+enum binary_printer_ops {
+       BINARY_PRINT_DATA_BEGIN,
+       BINARY_PRINT_LINE_BEGIN,
+       BINARY_PRINT_ADDR,
+       BINARY_PRINT_NUM_DATA,
+       BINARY_PRINT_NUM_PAD,
+       BINARY_PRINT_SEP,
+       BINARY_PRINT_CHAR_DATA,
+       BINARY_PRINT_CHAR_PAD,
+       BINARY_PRINT_LINE_END,
+       BINARY_PRINT_DATA_END,
+};
+
+typedef void (*print_binary_t)(enum binary_printer_ops,
+                              unsigned int val,
+                              void *extra);
  
+void print_binary(unsigned char *data, size_t len,
+                 size_t bytes_per_line, print_binary_t printer,
+                 void *extra);
  #endif /* GIT_COMPAT_UTIL_H */
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c

index 0dac7e05a6ac9e5f1500eca1cebbce2d900ab511..3fa94e291d16a2891b3799797c1a8da289d89b20 100644 (file)
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -1970,7 +1970,7 @@ int has_config_tdp(unsigned int family, unsigned int model)
  }
  
  static void
-dump_cstate_pstate_config_info(family, model)
+dump_cstate_pstate_config_info(unsigned int family, unsigned int model)
  {
         if (!do_nhm_platform_info)
                 return;
@@ -2142,7 +2142,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
  #define        RAPL_POWER_GRANULARITY  0x7FFF  /* 15 bit power granularity */
  #define        RAPL_TIME_GRANULARITY   0x3F /* 6 bit time granularity */
  
-double get_tdp(model)
+double get_tdp(unsigned int model)
  {
         unsigned long long msr;
  
@@ -2256,7 +2256,7 @@ void rapl_probe(unsigned int family, unsigned int model)
         return;
  }
  
-void perf_limit_reasons_probe(family, model)
+void perf_limit_reasons_probe(unsigned int family, unsigned int model)
  {
         if (!genuine_intel)
                 return;
@@ -2792,7 +2792,7 @@ void process_cpuid()
         perf_limit_reasons_probe(family, model);
  
         if (debug)
-               dump_cstate_pstate_config_info();
+               dump_cstate_pstate_config_info(family, model);
  
         if (has_skl_msrs(family, model))
                 calculate_tsc_tweak();
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile

index d0c473f6585058053d83ea2e6a3eee46321668a1..d5ce7d7aae3e41f22d276d39aa365a9e5faa9910 100644 (file)
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -4,15 +4,16 @@ include ../lib.mk
  
  .PHONY: all all_32 all_64 warn_32bit_failure clean
  
-TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall
-TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \
+TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall \
+                       check_initial_reg_state sigreturn ldt_gdt
+TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
                         test_FCMOV test_FCOMI test_FISTTP \
-                       ldt_gdt \
                         vdso_restorer
  
  TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
+TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY)
  BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
-BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64)
+BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64)
  
  CFLAGS := -O2 -g -std=gnu99 -pthread -Wall
  
@@ -40,7 +41,7 @@ clean:
  $(TARGETS_C_32BIT_ALL:%=%_32): %_32: %.c
         $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm
  
-$(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c
+$(TARGETS_C_64BIT_ALL:%=%_64): %_64: %.c
         $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
  
  # x86_64 users should be encouraged to install 32-bit libraries
@@ -65,3 +66,9 @@ endif
  sysret_ss_attrs_64: thunks.S
  ptrace_syscall_32: raw_syscall_helper_32.S
  test_syscall_vdso_32: thunks_32.S
+
+# check_initial_reg_state is special: it needs a custom entry, and it
+# needs to be static so that its interpreter doesn't destroy its initial
+# state.
+check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static
+check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static
diff --git a/tools/testing/selftests/x86/check_initial_reg_state.c b/tools/testing/selftests/x86/check_initial_reg_state.c

new file mode 100644 (file)

index 0000000..6aaed9b
--- /dev/null
+++ b/tools/testing/selftests/x86/check_initial_reg_state.c
@@ -0,0 +1,109 @@
+/*
+ * check_initial_reg_state.c - check that execve sets the correct state
+ * Copyright (c) 2014-2016 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+
+unsigned long ax, bx, cx, dx, si, di, bp, sp, flags;
+unsigned long r8, r9, r10, r11, r12, r13, r14, r15;
+
+asm (
+       ".pushsection .text\n\t"
+       ".type real_start, @function\n\t"
+       ".global real_start\n\t"
+       "real_start:\n\t"
+#ifdef __x86_64__
+       "mov %rax, ax\n\t"
+       "mov %rbx, bx\n\t"
+       "mov %rcx, cx\n\t"
+       "mov %rdx, dx\n\t"
+       "mov %rsi, si\n\t"
+       "mov %rdi, di\n\t"
+       "mov %rbp, bp\n\t"
+       "mov %rsp, sp\n\t"
+       "mov %r8, r8\n\t"
+       "mov %r9, r9\n\t"
+       "mov %r10, r10\n\t"
+       "mov %r11, r11\n\t"
+       "mov %r12, r12\n\t"
+       "mov %r13, r13\n\t"
+       "mov %r14, r14\n\t"
+       "mov %r15, r15\n\t"
+       "pushfq\n\t"
+       "popq flags\n\t"
+#else
+       "mov %eax, ax\n\t"
+       "mov %ebx, bx\n\t"
+       "mov %ecx, cx\n\t"
+       "mov %edx, dx\n\t"
+       "mov %esi, si\n\t"
+       "mov %edi, di\n\t"
+       "mov %ebp, bp\n\t"
+       "mov %esp, sp\n\t"
+       "pushfl\n\t"
+       "popl flags\n\t"
+#endif
+       "jmp _start\n\t"
+       ".size real_start, . - real_start\n\t"
+       ".popsection");
+
+int main()
+{
+       int nerrs = 0;
+
+       if (sp == 0) {
+               printf("[FAIL]\tTest was built incorrectly\n");
+               return 1;
+       }
+
+       if (ax || bx || cx || dx || si || di || bp
+#ifdef __x86_64__
+           || r8 || r9 || r10 || r11 || r12 || r13 || r14 || r15
+#endif
+               ) {
+               printf("[FAIL]\tAll GPRs except SP should be 0\n");
+#define SHOW(x) printf("\t" #x " = 0x%lx\n", x);
+               SHOW(ax);
+               SHOW(bx);
+               SHOW(cx);
+               SHOW(dx);
+               SHOW(si);
+               SHOW(di);
+               SHOW(bp);
+               SHOW(sp);
+#ifdef __x86_64__
+               SHOW(r8);
+               SHOW(r9);
+               SHOW(r10);
+               SHOW(r11);
+               SHOW(r12);
+               SHOW(r13);
+               SHOW(r14);
+               SHOW(r15);
+#endif
+               nerrs++;
+       } else {
+               printf("[OK]\tAll GPRs except SP are 0\n");
+       }
+
+       if (flags != 0x202) {
+               printf("[FAIL]\tFLAGS is 0x%lx, but it should be 0x202\n", flags);
+               nerrs++;
+       } else {
+               printf("[OK]\tFLAGS is 0x202\n");
+       }
+
+       return nerrs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c

index 5105b49cd8aa560d78d0e52ec7d12700bfbf83e7..421456784bc64d7c2ad5b14753194bcf89905d1d 100644 (file)
--- a/tools/testing/selftests/x86/ptrace_syscall.c
+++ b/tools/testing/selftests/x86/ptrace_syscall.c
@@ -103,6 +103,17 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
                 err(1, "sigaction");
  }
  
+static void setsigign(int sig, int flags)
+{
+       struct sigaction sa;
+       memset(&sa, 0, sizeof(sa));
+       sa.sa_sigaction = (void *)SIG_IGN;
+       sa.sa_flags = flags;
+       sigemptyset(&sa.sa_mask);
+       if (sigaction(sig, &sa, 0))
+               err(1, "sigaction");
+}
+
  static void clearhandler(int sig)
  {
         struct sigaction sa;
@@ -187,7 +198,7 @@ static void test_ptrace_syscall_restart(void)
  
         printf("[RUN]\tSYSEMU\n");
         if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
-               err(1, "PTRACE_SYSCALL");
+               err(1, "PTRACE_SYSEMU");
         wait_trap(chld);
  
         if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
@@ -218,7 +229,7 @@ static void test_ptrace_syscall_restart(void)
                 err(1, "PTRACE_SETREGS");
  
         if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
-               err(1, "PTRACE_SYSCALL");
+               err(1, "PTRACE_SYSEMU");
         wait_trap(chld);
  
         if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
@@ -250,7 +261,7 @@ static void test_ptrace_syscall_restart(void)
                 err(1, "PTRACE_SETREGS");
  
         if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
-               err(1, "PTRACE_SYSCALL");
+               err(1, "PTRACE_SYSEMU");
         wait_trap(chld);
  
         if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
@@ -277,6 +288,119 @@ static void test_ptrace_syscall_restart(void)
         }
  }
  
+static void test_restart_under_ptrace(void)
+{
+       printf("[RUN]\tkernel syscall restart under ptrace\n");
+       pid_t chld = fork();
+       if (chld < 0)
+               err(1, "fork");
+
+       if (chld == 0) {
+               if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0)
+                       err(1, "PTRACE_TRACEME");
+
+               printf("\tChild will take a nap until signaled\n");
+               setsigign(SIGUSR1, SA_RESTART);
+               raise(SIGSTOP);
+
+               syscall(SYS_pause, 0, 0, 0, 0, 0, 0);
+               _exit(0);
+       }
+
+       int status;
+
+       /* Wait for SIGSTOP. */
+       if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
+               err(1, "waitpid");
+
+       struct user_regs_struct regs;
+
+       printf("[RUN]\tSYSCALL\n");
+       if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0)
+               err(1, "PTRACE_SYSCALL");
+       wait_trap(chld);
+
+       /* We should be stopped at pause(2) entry. */
+
+       if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+               err(1, "PTRACE_GETREGS");
+
+       if (regs.user_syscall_nr != SYS_pause ||
+           regs.user_arg0 != 0 || regs.user_arg1 != 0 ||
+           regs.user_arg2 != 0 || regs.user_arg3 != 0 ||
+           regs.user_arg4 != 0 || regs.user_arg5 != 0) {
+               printf("[FAIL]\tInitial args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
+               nerrs++;
+       } else {
+               printf("[OK]\tInitial nr and args are correct\n");
+       }
+
+       /* Interrupt it. */
+       kill(chld, SIGUSR1);
+
+       /* Advance.  We should be stopped at exit. */
+       printf("[RUN]\tSYSCALL\n");
+       if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0)
+               err(1, "PTRACE_SYSCALL");
+       wait_trap(chld);
+
+       if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+               err(1, "PTRACE_GETREGS");
+
+       if (regs.user_syscall_nr != SYS_pause ||
+           regs.user_arg0 != 0 || regs.user_arg1 != 0 ||
+           regs.user_arg2 != 0 || regs.user_arg3 != 0 ||
+           regs.user_arg4 != 0 || regs.user_arg5 != 0) {
+               printf("[FAIL]\tArgs after SIGUSR1 are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
+               nerrs++;
+       } else {
+               printf("[OK]\tArgs after SIGUSR1 are correct (ax = %ld)\n",
+                      (long)regs.user_ax);
+       }
+
+       /* Poke the regs back in.  This must not break anything. */
+       if (ptrace(PTRACE_SETREGS, chld, 0, &regs) != 0)
+               err(1, "PTRACE_SETREGS");
+
+       /* Catch the (ignored) SIGUSR1. */
+       if (ptrace(PTRACE_CONT, chld, 0, 0) != 0)
+               err(1, "PTRACE_CONT");
+       if (waitpid(chld, &status, 0) != chld)
+               err(1, "waitpid");
+       if (!WIFSTOPPED(status)) {
+               printf("[FAIL]\tChild was stopped for SIGUSR1 (status = 0x%x)\n", status);
+               nerrs++;
+       } else {
+               printf("[OK]\tChild got SIGUSR1\n");
+       }
+
+       /* The next event should be pause(2) again. */
+       printf("[RUN]\tStep again\n");
+       if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0)
+               err(1, "PTRACE_SYSCALL");
+       wait_trap(chld);
+
+       /* We should be stopped at pause(2) entry. */
+
+       if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+               err(1, "PTRACE_GETREGS");
+
+       if (regs.user_syscall_nr != SYS_pause ||
+           regs.user_arg0 != 0 || regs.user_arg1 != 0 ||
+           regs.user_arg2 != 0 || regs.user_arg3 != 0 ||
+           regs.user_arg4 != 0 || regs.user_arg5 != 0) {
+               printf("[FAIL]\tpause did not restart (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
+               nerrs++;
+       } else {
+               printf("[OK]\tpause(2) restarted correctly\n");
+       }
+
+       /* Kill it. */
+       kill(chld, SIGKILL);
+       if (waitpid(chld, &status, 0) != chld)
+               err(1, "waitpid");
+}
+
  int main()
  {
         printf("[RUN]\tCheck int80 return regs\n");
@@ -290,5 +414,7 @@ int main()
  
         test_ptrace_syscall_restart();
  
+       test_restart_under_ptrace();
+
         return 0;
  }
diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c

index b5aa1bab7416394918fd59a441b0bda3853cfd28..8a577e7070c64104d74aa1e4361cfe8709894e25 100644 (file)
--- a/tools/testing/selftests/x86/sigreturn.c
+++ b/tools/testing/selftests/x86/sigreturn.c
@@ -54,6 +54,37 @@
  #include <sys/ptrace.h>
  #include <sys/user.h>
  
+/* Pull in AR_xyz defines. */
+typedef unsigned int u32;
+typedef unsigned short u16;
+#include "../../../../arch/x86/include/asm/desc_defs.h"
+
+/*
+ * Copied from asm/ucontext.h, as asm/ucontext.h conflicts badly with the glibc
+ * headers.
+ */
+#ifdef __x86_64__
+/*
+ * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on
+ * kernels that save SS in the sigcontext.  All kernels that set
+ * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp
+ * regardless of SS (i.e. they implement espfix).
+ *
+ * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS
+ * when delivering a signal that came from 64-bit code.
+ *
+ * Sigreturn restores SS as follows:
+ *
+ * if (saved SS is valid || UC_STRICT_RESTORE_SS is set ||
+ *     saved CS is not 64-bit)
+ *         new SS = saved SS  (will fail IRET and signal if invalid)
+ * else
+ *         new SS = a flat 32-bit data segment
+ */
+#define UC_SIGCONTEXT_SS       0x2
+#define UC_STRICT_RESTORE_SS   0x4
+#endif
+
  /*
   * In principle, this test can run on Linux emulation layers (e.g.
   * Illumos "LX branded zones").  Solaris-based kernels reserve LDT
@@ -267,6 +298,9 @@ static gregset_t initial_regs, requested_regs, resulting_regs;
  /* Instructions for the SIGUSR1 handler. */
  static volatile unsigned short sig_cs, sig_ss;
  static volatile sig_atomic_t sig_trapped, sig_err, sig_trapno;
+#ifdef __x86_64__
+static volatile sig_atomic_t sig_corrupt_final_ss;
+#endif
  
  /* Abstractions for some 32-bit vs 64-bit differences. */
  #ifdef __x86_64__
@@ -305,9 +339,105 @@ static greg_t *csptr(ucontext_t *ctx)
  }
  #endif
  
+/*
+ * Checks a given selector for its code bitness or returns -1 if it's not
+ * a usable code segment selector.
+ */
+int cs_bitness(unsigned short cs)
+{
+       uint32_t valid = 0, ar;
+       asm ("lar %[cs], %[ar]\n\t"
+            "jnz 1f\n\t"
+            "mov $1, %[valid]\n\t"
+            "1:"
+            : [ar] "=r" (ar), [valid] "+rm" (valid)
+            : [cs] "r" (cs));
+
+       if (!valid)
+               return -1;
+
+       bool db = (ar & (1 << 22));
+       bool l = (ar & (1 << 21));
+
+       if (!(ar & (1<<11)))
+           return -1;  /* Not code. */
+
+       if (l && !db)
+               return 64;
+       else if (!l && db)
+               return 32;
+       else if (!l && !db)
+               return 16;
+       else
+               return -1;      /* Unknown bitness. */
+}
+
+/*
+ * Checks a given selector for its code bitness or returns -1 if it's not
+ * a usable code segment selector.
+ */
+bool is_valid_ss(unsigned short cs)
+{
+       uint32_t valid = 0, ar;
+       asm ("lar %[cs], %[ar]\n\t"
+            "jnz 1f\n\t"
+            "mov $1, %[valid]\n\t"
+            "1:"
+            : [ar] "=r" (ar), [valid] "+rm" (valid)
+            : [cs] "r" (cs));
+
+       if (!valid)
+               return false;
+
+       if ((ar & AR_TYPE_MASK) != AR_TYPE_RWDATA &&
+           (ar & AR_TYPE_MASK) != AR_TYPE_RWDATA_EXPDOWN)
+               return false;
+
+       return (ar & AR_P);
+}
+
  /* Number of errors in the current test case. */
  static volatile sig_atomic_t nerrs;
  
+static void validate_signal_ss(int sig, ucontext_t *ctx)
+{
+#ifdef __x86_64__
+       bool was_64bit = (cs_bitness(*csptr(ctx)) == 64);
+
+       if (!(ctx->uc_flags & UC_SIGCONTEXT_SS)) {
+               printf("[FAIL]\tUC_SIGCONTEXT_SS was not set\n");
+               nerrs++;
+
+               /*
+                * This happens on Linux 4.1.  The rest will fail, too, so
+                * return now to reduce the noise.
+                */
+               return;
+       }
+
+       /* UC_STRICT_RESTORE_SS is set iff we came from 64-bit mode. */
+       if (!!(ctx->uc_flags & UC_STRICT_RESTORE_SS) != was_64bit) {
+               printf("[FAIL]\tUC_STRICT_RESTORE_SS was wrong in signal %d\n",
+                      sig);
+               nerrs++;
+       }
+
+       if (is_valid_ss(*ssptr(ctx))) {
+               /*
+                * DOSEMU was written before 64-bit sigcontext had SS, and
+                * it tries to figure out the signal source SS by looking at
+                * the physical register.  Make sure that keeps working.
+                */
+               unsigned short hw_ss;
+               asm ("mov %%ss, %0" : "=rm" (hw_ss));
+               if (hw_ss != *ssptr(ctx)) {
+                       printf("[FAIL]\tHW SS didn't match saved SS\n");
+                       nerrs++;
+               }
+       }
+#endif
+}
+
  /*
   * SIGUSR1 handler.  Sets CS and SS as requested and points IP to the
   * int3 trampoline.  Sets SP to a large known value so that we can see
@@ -317,6 +447,8 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
  {
         ucontext_t *ctx = (ucontext_t*)ctx_void;
  
+       validate_signal_ss(sig, ctx);
+
         memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
  
         *csptr(ctx) = sig_cs;
@@ -334,13 +466,16 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
  }
  
  /*
- * Called after a successful sigreturn.  Restores our state so that
- * the original raise(SIGUSR1) returns.
+ * Called after a successful sigreturn (via int3) or from a failed
+ * sigreturn (directly by kernel).  Restores our state so that the
+ * original raise(SIGUSR1) returns.
   */
  static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
  {
         ucontext_t *ctx = (ucontext_t*)ctx_void;
  
+       validate_signal_ss(sig, ctx);
+
         sig_err = ctx->uc_mcontext.gregs[REG_ERR];
         sig_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO];
  
@@ -358,41 +493,62 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
         memcpy(&resulting_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
         memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t));
  
+#ifdef __x86_64__
+       if (sig_corrupt_final_ss) {
+               if (ctx->uc_flags & UC_STRICT_RESTORE_SS) {
+                       printf("[FAIL]\tUC_STRICT_RESTORE_SS was set inappropriately\n");
+                       nerrs++;
+               } else {
+                       /*
+                        * DOSEMU transitions from 32-bit to 64-bit mode by
+                        * adjusting sigcontext, and it requires that this work
+                        * even if the saved SS is bogus.
+                        */
+                       printf("\tCorrupting SS on return to 64-bit mode\n");
+                       *ssptr(ctx) = 0;
+               }
+       }
+#endif
+
         sig_trapped = sig;
  }
  
-/*
- * Checks a given selector for its code bitness or returns -1 if it's not
- * a usable code segment selector.
- */
-int cs_bitness(unsigned short cs)
+#ifdef __x86_64__
+/* Tests recovery if !UC_STRICT_RESTORE_SS */
+static void sigusr2(int sig, siginfo_t *info, void *ctx_void)
  {
-       uint32_t valid = 0, ar;
-       asm ("lar %[cs], %[ar]\n\t"
-            "jnz 1f\n\t"
-            "mov $1, %[valid]\n\t"
-            "1:"
-            : [ar] "=r" (ar), [valid] "+rm" (valid)
-            : [cs] "r" (cs));
+       ucontext_t *ctx = (ucontext_t*)ctx_void;
  
-       if (!valid)
-               return -1;
+       if (!(ctx->uc_flags & UC_STRICT_RESTORE_SS)) {
+               printf("[FAIL]\traise(2) didn't set UC_STRICT_RESTORE_SS\n");
+               nerrs++;
+               return;  /* We can't do the rest. */
+       }
  
-       bool db = (ar & (1 << 22));
-       bool l = (ar & (1 << 21));
+       ctx->uc_flags &= ~UC_STRICT_RESTORE_SS;
+       *ssptr(ctx) = 0;
  
-       if (!(ar & (1<<11)))
-           return -1;  /* Not code. */
+       /* Return.  The kernel should recover without sending another signal. */
+}
  
-       if (l && !db)
-               return 64;
-       else if (!l && db)
-               return 32;
-       else if (!l && !db)
-               return 16;
-       else
-               return -1;      /* Unknown bitness. */
+static int test_nonstrict_ss(void)
+{
+       clearhandler(SIGUSR1);
+       clearhandler(SIGTRAP);
+       clearhandler(SIGSEGV);
+       clearhandler(SIGILL);
+       sethandler(SIGUSR2, sigusr2, 0);
+
+       nerrs = 0;
+
+       printf("[RUN]\tClear UC_STRICT_RESTORE_SS and corrupt SS\n");
+       raise(SIGUSR2);
+       if (!nerrs)
+               printf("[OK]\tIt worked\n");
+
+       return nerrs;
  }
+#endif
  
  /* Finds a usable code segment of the requested bitness. */
  int find_cs(int bitness)
@@ -576,6 +732,12 @@ static int test_bad_iret(int cs_bits, unsigned short ss, int force_cs)
                        errdesc, strsignal(sig_trapped));
                 return 0;
         } else {
+               /*
+                * This also implicitly tests UC_STRICT_RESTORE_SS:
+                * We check that these signals set UC_STRICT_RESTORE_SS and,
+                * if UC_STRICT_RESTORE_SS doesn't cause strict behavior,
+                * then we won't get SIGSEGV.
+                */
                 printf("[FAIL]\tDid not get SIGSEGV\n");
                 return 1;
         }
@@ -632,6 +794,14 @@ int main()
                                                     GDT3(gdt_data16_idx));
         }
  
+#ifdef __x86_64__
+       /* Nasty ABI case: check SS corruption handling. */
+       sig_corrupt_final_ss = 1;
+       total_nerrs += test_valid_sigreturn(32, false, -1);
+       total_nerrs += test_valid_sigreturn(32, true, -1);
+       sig_corrupt_final_ss = 0;
+#endif
+
         /*
          * We're done testing valid sigreturn cases.  Now we test states
          * for which sigreturn itself will succeed but the subsequent
@@ -680,5 +850,9 @@ int main()
         if (gdt_npdata32_idx)
                 test_bad_iret(32, GDT3(gdt_npdata32_idx), -1);
  
+#ifdef __x86_64__
+       total_nerrs += test_nonstrict_ss();
+#endif
+
         return total_nerrs ? 1 : 0;
  }
diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c

index 60c06af4646a3de0034820a65d805d0be16307ab..43fcab367fb0a243c2b96703f3ef91bc323aa806 100644 (file)
--- a/tools/testing/selftests/x86/syscall_nt.c
+++ b/tools/testing/selftests/x86/syscall_nt.c
@@ -17,6 +17,9 @@
  
  #include <stdio.h>
  #include <unistd.h>
+#include <string.h>
+#include <signal.h>
+#include <err.h>
  #include <sys/syscall.h>
  #include <asm/processor-flags.h>
  
@@ -26,6 +29,8 @@
  # define WIDTH "l"
  #endif
  
+static unsigned int nerrs;
+
  static unsigned long get_eflags(void)
  {
         unsigned long eflags;
@@ -39,16 +44,52 @@ static void set_eflags(unsigned long eflags)
                       : : "rm" (eflags) : "flags");
  }
  
-int main()
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+                      int flags)
  {
-       printf("[RUN]\tSet NT and issue a syscall\n");
-       set_eflags(get_eflags() | X86_EFLAGS_NT);
+       struct sigaction sa;
+       memset(&sa, 0, sizeof(sa));
+       sa.sa_sigaction = handler;
+       sa.sa_flags = SA_SIGINFO | flags;
+       sigemptyset(&sa.sa_mask);
+       if (sigaction(sig, &sa, 0))
+               err(1, "sigaction");
+}
+
+static void sigtrap(int sig, siginfo_t *si, void *ctx_void)
+{
+}
+
+static void do_it(unsigned long extraflags)
+{
+       unsigned long flags;
+
+       set_eflags(get_eflags() | extraflags);
         syscall(SYS_getpid);
-       if (get_eflags() & X86_EFLAGS_NT) {
-               printf("[OK]\tThe syscall worked and NT is still set\n");
-               return 0;
+       flags = get_eflags();
+       if ((flags & extraflags) == extraflags) {
+               printf("[OK]\tThe syscall worked and flags are still set\n");
         } else {
-               printf("[FAIL]\tThe syscall worked but NT was cleared\n");
-               return 1;
+               printf("[FAIL]\tThe syscall worked but flags were cleared (flags = 0x%lx but expected 0x%lx set)\n",
+                      flags, extraflags);
+               nerrs++;
         }
  }
+
+int main(void)
+{
+       printf("[RUN]\tSet NT and issue a syscall\n");
+       do_it(X86_EFLAGS_NT);
+
+       /*
+        * Now try it again with TF set -- TF forces returns via IRET in all
+        * cases except non-ptregs-using 64-bit full fast path syscalls.
+        */
+
+       sethandler(SIGTRAP, sigtrap, 0);
+
+       printf("[RUN]\tSet NT|TF and issue a syscall\n");
+       do_it(X86_EFLAGS_NT | X86_EFLAGS_TF);
+
+       return nerrs == 0 ? 0 : 1;
+}
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c

index db2dd3335c6a6de0c2f0dd9fe89e462d16f5ba81..65da997b430a8e8c2b3d8eb9de93fb6b42d00f80 100644 (file)
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -97,8 +97,8 @@ static void async_pf_execute(struct work_struct *work)
          * This memory barrier pairs with prepare_to_wait's set_current_state()
          */
         smp_mb();
-       if (waitqueue_active(&vcpu->wq))
-               wake_up_interruptible(&vcpu->wq);
+       if (swait_active(&vcpu->wq))
+               swake_up(&vcpu->wq);
  
         mmput(mm);
         kvm_put_kvm(vcpu->kvm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c

index a11cfd20a6a0d2aa86b1d06b8552bc41a8bfd8c9..5af50c3ddd535a5094b8de788bd88cc17085fb17 100644 (file)
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -216,8 +216,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
         vcpu->kvm = kvm;
         vcpu->vcpu_id = id;
         vcpu->pid = NULL;
-       vcpu->halt_poll_ns = 0;
-       init_waitqueue_head(&vcpu->wq);
+       init_swait_queue_head(&vcpu->wq);
         kvm_async_pf_vcpu_init(vcpu);
  
         vcpu->pre_pcpu = -1;
@@ -1952,6 +1951,9 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
         else
                 val *= halt_poll_ns_grow;
  
+       if (val > halt_poll_ns)
+               val = halt_poll_ns;
+
         vcpu->halt_poll_ns = val;
         trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
  }
@@ -1990,7 +1992,7 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
  void kvm_vcpu_block(struct kvm_vcpu *vcpu)
  {
         ktime_t start, cur;
-       DEFINE_WAIT(wait);
+       DECLARE_SWAITQUEUE(wait);
         bool waited = false;
         u64 block_ns;
  
@@ -2015,7 +2017,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
         kvm_arch_vcpu_blocking(vcpu);
  
         for (;;) {
-               prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+               prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
  
                 if (kvm_vcpu_check_block(vcpu) < 0)
                         break;
@@ -2024,7 +2026,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
                 schedule();
         }
  
-       finish_wait(&vcpu->wq, &wait);
+       finish_swait(&vcpu->wq, &wait);
         cur = ktime_get();
  
         kvm_arch_vcpu_unblocking(vcpu);
@@ -2056,11 +2058,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
  {
         int me;
         int cpu = vcpu->cpu;
-       wait_queue_head_t *wqp;
+       struct swait_queue_head *wqp;
  
         wqp = kvm_arch_vcpu_wq(vcpu);
-       if (waitqueue_active(wqp)) {
-               wake_up_interruptible(wqp);
+       if (swait_active(wqp)) {
+               swake_up(wqp);
                 ++vcpu->stat.halt_wakeup;
         }
  
@@ -2161,7 +2163,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
                                 continue;
                         if (vcpu == me)
                                 continue;
-                       if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
+                       if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
                                 continue;
                         if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                                 continue;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 15 Mar 2016 17:16:48 +0000 (10:16 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 15 Mar 2016 17:16:48 +0000 (10:16 -0700)